From 323663bb115ef625642391a5a8e9b35fee8b2ae3 Mon Sep 17 00:00:00 2001
From: Hyeongseok Oh
Date: Wed, 12 Apr 2023 15:42:02 +0900
Subject: Imported Upstream version 1.22.0

---
 runtime/onert/api/include/nnfw_version.h | 2 +-
 runtime/onert/api/src/nnfw_api_internal.cc | 370 +++------
 runtime/onert/api/src/nnfw_api_internal.h | 9 +-
 runtime/onert/backend/acl_cl/KernelGenerator.cc | 2 +-
 .../onert/backend/acl_common/AclTensorBuilder.h | 20 +-
 .../cl_common/include/cl_common/BackendContext.h | 2 +-
 runtime/onert/backend/cpu/BackendContext.cc | 2 +-
 runtime/onert/backend/cpu/KernelGenerator.cc | 18 +-
 runtime/onert/backend/gpu_cl/Backend.h | 16 +-
 runtime/onert/backend/gpu_cl/BackendContext.cc | 26 +
 runtime/onert/backend/gpu_cl/BackendContext.h | 2 +
 runtime/onert/backend/gpu_cl/CMakeLists.txt | 43 +-
 runtime/onert/backend/gpu_cl/ClFunction.h | 32 +-
 runtime/onert/backend/gpu_cl/Config.h | 3 -
 runtime/onert/backend/gpu_cl/KernelGenerator.cc | 612 ++++++++------
 runtime/onert/backend/gpu_cl/KernelGenerator.h | 15 +-
 runtime/onert/backend/gpu_cl/MemoryManager.h | 115 +--
 runtime/onert/backend/gpu_cl/TensorBuilder.cc | 24 +-
 runtime/onert/backend/gpu_cl/TensorBuilder.h | 7 +-
 runtime/onert/backend/gpu_cl/TensorBuilderHelper.h | 44 -
 runtime/onert/backend/gpu_cl/TensorManager.cc | 38 +-
 runtime/onert/backend/gpu_cl/TensorManager.h | 22 +-
 runtime/onert/backend/gpu_cl/TensorRegistry.h | 2 +-
 runtime/onert/backend/gpu_cl/Utils.h | 155 ++++
 .../onert/backend/gpu_cl/ex/InferenceContextEx.h | 108 ---
 runtime/onert/backend/gpu_cl/operand/CLTensor.cc | 8 +-
 runtime/onert/backend/gpu_cl/operand/CLTensor.h | 4 +-
 runtime/onert/backend/gpu_cl/operand/ICLTensor.cc | 40 +-
 runtime/onert/backend/gpu_cl/operand/ICLTensor.h | 49 +-
 runtime/onert/backend/ruy/BackendContext.cc | 2 +-
 runtime/onert/backend/trix/BackendContext.cc | 2 +-
 runtime/onert/backend/trix/BatchThreadPool.cc | 69 ++
 runtime/onert/backend/trix/BatchThreadPool.h | 130 +++
 runtime/onert/backend/trix/Convert.cc | 54 ++
 runtime/onert/backend/trix/Convert.h | 93 +++
 runtime/onert/backend/trix/DevContext.cc | 307 +++++++
 runtime/onert/backend/trix/DevContext.h | 197 +++--
 runtime/onert/backend/trix/KernelGenerator.cc | 4 +-
 runtime/onert/backend/trix/ops/BulkLayer.cc | 137 +---
 runtime/onert/backend/trix/ops/BulkLayer.h | 3 +-
 runtime/onert/backend/xnnpack/BackendContext.cc | 2 +-
 runtime/onert/core/CMakeLists.txt | 2 +-
 .../include/backend/basic/BackendContextHelpers.h | 18 +-
 runtime/onert/core/include/compiler/Compiler.h | 107 +--
 .../onert/core/include/compiler/CompilerFactory.h | 47 ++
 .../onert/core/include/compiler/CompilerOptions.h | 91 +++
 runtime/onert/core/include/compiler/ICompiler.h | 63 ++
 runtime/onert/core/include/compiler/LoweredGraph.h | 5 -
 .../core/include/compiler/StaticShapeInferer.h | 9 +
 runtime/onert/core/include/exec/Execution.h | 119 +--
 runtime/onert/core/include/exec/Executors.h | 71 --
 runtime/onert/core/include/exec/FunctionSequence.h | 2 +-
 runtime/onert/core/include/exec/IExecutor.h | 17 +-
 runtime/onert/core/include/exec/IExecutors.h | 98 +++
 runtime/onert/core/include/ir/Graph.h | 36 -
 runtime/onert/core/include/ir/Index.h | 4 +-
 runtime/onert/core/include/ir/NNPkg.h | 102 ++-
 .../onert/core/include/ir/OperandIndexSequence.h | 7 -
 runtime/onert/core/include/ir/Shape.h | 6 +-
 runtime/onert/core/include/util/Config.lst | 1 -
 runtime/onert/core/include/util/Index.h | 7 +
 runtime/onert/core/include/util/ObjectManager.h | 4 +-
 runtime/onert/core/include/util/Utils.h | 37 +-
 .../onert/core/src/backend/basic/MemoryManager.cc | 2 +-
 .../onert/core/src/backend/basic/MemoryPlanner.cc | 2 +-
 .../core/src/backend/basic/StaticTensorManager.cc | 2 +-
 .../core/src/backend/builtin/BackendContext.cc | 2 +-
 runtime/onert/core/src/backend/builtin/IOTensor.h | 2 +-
 .../core/src/backend/builtin/KernelGenerator.cc | 8 +-
 .../core/src/backend/builtin/KernelGenerator.h | 9 +-
 .../core/src/backend/builtin/kernel/IfLayer.cc | 8 +-
 .../core/src/backend/builtin/kernel/IfLayer.h | 8 +-
 .../src/backend/builtin/kernel/PermuteLayer.cc | 8 +-
 .../core/src/backend/builtin/kernel/WhileLayer.cc | 10 +-
 .../core/src/backend/builtin/kernel/WhileLayer.h | 8 +-
 runtime/onert/core/src/compiler/Compiler.cc | 772 ++----------------
 runtime/onert/core/src/compiler/CompilerFactory.cc | 45 +
 runtime/onert/core/src/compiler/CompilerOptions.cc | 145 ++++
 runtime/onert/core/src/compiler/ExecutorFactory.cc | 51 +-
 runtime/onert/core/src/compiler/ExecutorFactory.h | 25 +-
 .../onert/core/src/compiler/Fp32ToFp16Converter.cc | 24 +-
 runtime/onert/core/src/compiler/HEScheduler.cc | 2 +-
 .../onert/core/src/compiler/HEScheduler.test.cc | 4 +-
 runtime/onert/core/src/compiler/LoweredGraph.cc | 8 -
 runtime/onert/core/src/compiler/ManualScheduler.cc | 4 +-
 .../onert/core/src/compiler/MultiModelCompiler.cc | 214 +++++
 .../onert/core/src/compiler/MultiModelCompiler.h | 75 ++
 .../onert/core/src/compiler/StaticShapeInferer.cc | 98 ++-
 runtime/onert/core/src/compiler/TensorRegistries.h | 2 +-
 .../onert/core/src/compiler/pass/OddOutputPass.cc | 4 +-
 runtime/onert/core/src/compiler/pass/PassRunner.cc | 2 +-
 .../src/compiler/pass/PermutationInsertionPass.cc | 4 +-
 runtime/onert/core/src/exec/Execution.cc | 247 +----
 runtime/onert/core/src/exec/Execution.test.cc | 337 +++++++-
 runtime/onert/core/src/exec/ExecutionObservee.cc | 8 +-
 runtime/onert/core/src/exec/ExecutionObservers.h | 2 +-
 runtime/onert/core/src/exec/ExecutorBase.cc | 57 +-
 runtime/onert/core/src/exec/ExecutorBase.h | 13 +-
 runtime/onert/core/src/exec/Executors.cc | 672 ++++++++++++---
 runtime/onert/core/src/exec/Executors.h | 169 ++++
 runtime/onert/core/src/exec/IPermuteFunction.cc | 320 ++++++++
 runtime/onert/core/src/exec/IPermuteFunction.h | 99 +--
 .../onert/core/src/exec/IPermuteFunction.test.cc | 902 +++++++++++++++++++++
 runtime/onert/core/src/exec/ParallelScheduler.cc | 2 +-
 .../onert/core/src/exec/SingleModelExecutors.cc | 61 ++
 runtime/onert/core/src/exec/SingleModelExecutors.h | 70 ++
 runtime/onert/core/src/exec/ThreadPool.cc | 2 +-
 runtime/onert/core/src/interp/Buffer.h | 91 ---
 runtime/onert/core/src/interp/ExecEnv.h | 212 -----
 runtime/onert/core/src/interp/InterpExecutor.cc | 127 ---
 runtime/onert/core/src/interp/InterpExecutor.h | 89 --
 .../onert/core/src/interp/InterpExecutor.test.cc | 355 --------
 runtime/onert/core/src/interp/InterpOps.lst | 73 --
 runtime/onert/core/src/interp/Interpreter.cc | 184 -----
 runtime/onert/core/src/interp/Interpreter.h | 64 --
 runtime/onert/core/src/interp/Registration.h | 43 -
 runtime/onert/core/src/interp/Tensor.cc | 57 --
 runtime/onert/core/src/interp/Tensor.h | 189 -----
 .../src/interp/operations/BinaryArithmeticOps.cc | 204 -----
 runtime/onert/core/src/interp/operations/Concat.cc | 147 ----
 runtime/onert/core/src/interp/operations/Conv2D.cc | 151 ----
 .../core/src/interp/operations/DepthwiseConv2D.cc | 156 ----
 .../interp/operations/ElementwiseActivations.cc | 160 ----
 .../core/src/interp/operations/FullyConnected.cc | 134 ---
 runtime/onert/core/src/interp/operations/Gather.cc | 138 ----
 .../core/src/interp/operations/InstanceNorm.cc | 121 ---
 .../core/src/interp/operations/OperationUtil.h | 203 -----
 runtime/onert/core/src/interp/operations/Pad.cc | 106 ---
 runtime/onert/core/src/interp/operations/Pool2D.cc | 140 ----
 .../onert/core/src/interp/operations/Reshape.cc | 63 --
 .../onert/core/src/interp/operations/Softmax.cc | 123 ---
 .../core/src/interp/operations/TransposeConv.cc | 141 ----
 runtime/onert/core/src/ir/Shape.cc | 8 +-
 runtime/onert/core/src/ir/Shape.test.cc | 2 +-
 .../core/src/util/ChromeTracingEventWriter.cc | 6 +-
 runtime/onert/core/src/util/MDTableEventWriter.cc | 8 +-
 runtime/onert/core/src/util/SNPEEventWriter.cc | 22 +-
 runtime/onert/core/src/util/ShapeInference.cc | 6 +-
 .../frontend/base_loader/include/base_loader.h | 30 +-
 runtime/onert/frontend/circle/src/circle_loader.cc | 4 +-
 runtime/onert/frontend/nnapi/CMakeLists.txt | 2 +-
 runtime/onert/frontend/nnapi/compilation.cc | 4 +-
 runtime/onert/frontend/nnapi/execution.cc | 2 +-
 .../nnapi/wrapper/ANeuralNetworksCompilation.cc | 5 +-
 .../nnapi/wrapper/ANeuralNetworksCompilation.h | 6 +-
 .../nnapi/wrapper/ANeuralNetworksExecution.h | 2 +-
 runtime/onert/frontend/tflite/src/tflite_loader.cc | 5 +-
 147 files changed, 5448 insertions(+), 6287 deletions(-)
 delete mode 100644 runtime/onert/backend/gpu_cl/TensorBuilderHelper.h
 create mode 100644 runtime/onert/backend/gpu_cl/Utils.h
 delete mode 100644 runtime/onert/backend/gpu_cl/ex/InferenceContextEx.h
 create mode 100644 runtime/onert/backend/trix/BatchThreadPool.cc
 create mode 100644 runtime/onert/backend/trix/BatchThreadPool.h
 create mode 100644 runtime/onert/backend/trix/Convert.cc
 create mode 100644 runtime/onert/backend/trix/Convert.h
 create mode 100644 runtime/onert/backend/trix/DevContext.cc
 create mode 100644 runtime/onert/core/include/compiler/CompilerFactory.h
 create mode 100644 runtime/onert/core/include/compiler/CompilerOptions.h
 create mode 100644 runtime/onert/core/include/compiler/ICompiler.h
 delete mode 100644 runtime/onert/core/include/exec/Executors.h
 create mode 100644 runtime/onert/core/include/exec/IExecutors.h
 create mode 100644 runtime/onert/core/src/compiler/CompilerFactory.cc
 create mode 100644 runtime/onert/core/src/compiler/CompilerOptions.cc
 create mode 100644 runtime/onert/core/src/compiler/MultiModelCompiler.cc
 create mode 100644 runtime/onert/core/src/compiler/MultiModelCompiler.h
 create mode 100644 runtime/onert/core/src/exec/Executors.h
 create mode 100644 runtime/onert/core/src/exec/IPermuteFunction.cc
 create mode 100644 runtime/onert/core/src/exec/IPermuteFunction.test.cc
 create mode 100644 runtime/onert/core/src/exec/SingleModelExecutors.cc
 create mode 100644 runtime/onert/core/src/exec/SingleModelExecutors.h
 delete mode 100644 runtime/onert/core/src/interp/Buffer.h
 delete mode 100644 runtime/onert/core/src/interp/ExecEnv.h
 delete mode 100644 runtime/onert/core/src/interp/InterpExecutor.cc
 delete mode 100644 runtime/onert/core/src/interp/InterpExecutor.h
 delete mode 100644 runtime/onert/core/src/interp/InterpExecutor.test.cc
 delete mode 100644 runtime/onert/core/src/interp/InterpOps.lst
 delete mode 100644 runtime/onert/core/src/interp/Interpreter.cc
 delete mode 100644 runtime/onert/core/src/interp/Interpreter.h
 delete mode 100644 runtime/onert/core/src/interp/Registration.h
 delete mode 100644 runtime/onert/core/src/interp/Tensor.cc
 delete mode 100644 runtime/onert/core/src/interp/Tensor.h
 delete mode 100644 runtime/onert/core/src/interp/operations/BinaryArithmeticOps.cc
 delete mode 100644
runtime/onert/core/src/interp/operations/Concat.cc delete mode 100644 runtime/onert/core/src/interp/operations/Conv2D.cc delete mode 100644 runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc delete mode 100644 runtime/onert/core/src/interp/operations/ElementwiseActivations.cc delete mode 100644 runtime/onert/core/src/interp/operations/FullyConnected.cc delete mode 100644 runtime/onert/core/src/interp/operations/Gather.cc delete mode 100644 runtime/onert/core/src/interp/operations/InstanceNorm.cc delete mode 100644 runtime/onert/core/src/interp/operations/OperationUtil.h delete mode 100644 runtime/onert/core/src/interp/operations/Pad.cc delete mode 100644 runtime/onert/core/src/interp/operations/Pool2D.cc delete mode 100644 runtime/onert/core/src/interp/operations/Reshape.cc delete mode 100644 runtime/onert/core/src/interp/operations/Softmax.cc delete mode 100644 runtime/onert/core/src/interp/operations/TransposeConv.cc (limited to 'runtime/onert') diff --git a/runtime/onert/api/include/nnfw_version.h b/runtime/onert/api/include/nnfw_version.h index 2fbb96f31..be30ee296 100644 --- a/runtime/onert/api/include/nnfw_version.h +++ b/runtime/onert/api/include/nnfw_version.h @@ -21,6 +21,6 @@ * NNFW_VERSION is a uint32 value representing nnfw runtime version * in 0xMMmmmmPP, where MM = major, mmmm = minor, PP = patch */ -#define NNFW_VERSION 0x01001500 +#define NNFW_VERSION 0x01001600 #endif // __NNFW_VERSION_H__ diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc index 9b43dd381..8eedb5314 100644 --- a/runtime/onert/api/src/nnfw_api_internal.cc +++ b/runtime/onert/api/src/nnfw_api_internal.cc @@ -16,7 +16,7 @@ #include "nnfw_api_internal.h" #include "CustomKernelRegistry.h" -#include "compiler/Compiler.h" +#include "compiler/CompilerFactory.h" #include "util/ConfigSource.h" #include "util/Exceptions.h" #include "util/logging.h" @@ -208,29 +208,24 @@ NNFW_STATUS nnfw_session::create(nnfw_session **session) { if (session == nullptr) return NNFW_STATUS_UNEXPECTED_NULL; - - // Create session - *session = new (std::nothrow) nnfw_session(); - if (*session == nullptr) + try { - std::cerr << "Error during session creation" << std::endl; - return NNFW_STATUS_OUT_OF_MEMORY; + auto new_session = std::unique_ptr(new nnfw_session()); + new_session->_kernel_registry = std::make_shared(); + *session = new_session.release(); } - - // Initialize fields - try + catch (const std::bad_alloc &e) { - (*session)->_kernel_registry = std::make_shared(); + std::cerr << "Error during session creation" << std::endl; + *session = nullptr; // Set nullptr on error to keep the old behavior + return NNFW_STATUS_OUT_OF_MEMORY; } catch (const std::exception &e) { std::cerr << "Error during session initialization : " << e.what() << std::endl; - delete *session; - *session = nullptr; - + *session = nullptr; // Set nullptr on error to keep the old behavior return NNFW_STATUS_ERROR; } - return NNFW_STATUS_NO_ERROR; } @@ -331,7 +326,6 @@ NNFW_STATUS nnfw_session::load_model_from_nnpackage(const char *package_dir) std::string manifest_file_name = package_path + "/metadata/MANIFEST"; std::ifstream mfs(manifest_file_name); - _package_file_path = package_path; // extract the filename of the first(index 0) model // e.g. 
In MANIFEST file, { "models" : [ "firstmodel.tflite", "2nd.tflite" ] } Json::Value root; @@ -351,7 +345,14 @@ NNFW_STATUS nnfw_session::load_model_from_nnpackage(const char *package_dir) } } _nnpkg = std::make_shared(); - for (uint32_t i = 0; i < models.size(); ++i) + auto num_models = models.size(); + if (num_models == 0 || (num_models - 1) > onert::ir::ModelIndex::max()) + { + std::cerr << "Invalid model size - " << std::to_string(num_models) << std::endl; + return NNFW_STATUS_ERROR; + } + + for (uint16_t i = 0; i < num_models; ++i) { auto model_file_path = package_path + std::string("/") + models[i].asString(); auto model_type = model_types[i].asString(); @@ -390,6 +391,8 @@ NNFW_STATUS nnfw_session::load_model_from_nnpackage(const char *package_dir) for (uint32_t j = 0; j < tos.size(); ++j) _nnpkg->addEdge(toIODesc(fromtos[i]["from"].asString()), toIODesc(tos[j].asString())); } + + _nnpkg->verify(); _state = State::MODEL_LOADED; } catch (const std::exception &e) @@ -420,14 +423,7 @@ NNFW_STATUS nnfw_session::prepare() try { - // TODO: Compile all models in case of multiple models - if (_nnpkg->model_count() > 2) - { - std::cerr << "Error during model prepare : more than 3 multiple models are not supported yet." - << std::endl; - return NNFW_STATUS_ERROR; - } - auto compiler = std::make_unique(_nnpkg, _coptions); + auto compiler = onert::compiler::CompilerFactory::get().create(_nnpkg, _coptions); _nnpkg.reset(); _compiler_artifact = compiler->compile(); _execution = std::make_unique(_compiler_artifact->_executors); @@ -442,50 +438,10 @@ NNFW_STATUS nnfw_session::prepare() return NNFW_STATUS_NO_ERROR; } -NNFW_STATUS nnfw_session::prepare_pipeline(const char *map_file_path) +NNFW_STATUS nnfw_session::prepare_pipeline(const char *) { - // NOTE. If users want to run prepare_pipeline() more than one time, this could be removed. 
- if (!isStateModelLoaded()) - { - std::cerr << "Error during model prepare pipeline : "; - if (isStateInitialized()) - { - std::cerr << "prepare_pipeline should be run once"; - } - else - { - std::cerr << "invalid state"; - } - std::cerr << std::endl; - return NNFW_STATUS_INVALID_STATE; - } - - try - { - auto model = _nnpkg->primary_model(); - auto compiler = std::make_unique(model, *_coptions[0]); - _nnpkg.reset(); - auto artifacts = compiler->compile(_package_file_path.c_str(), map_file_path); - - for (auto it = artifacts.begin(); it != artifacts.end(); ++it) - { - _executions.push_back(std::make_shared(it->get()->_executors)); - } - make_dependency(); - _threads.resize(_executions.size()); - for (uint32_t i = 0; i < _threads.size(); i++) - { - _threads[i] = std::thread(&onert::exec::Execution::runInference, _executions[i].get()); - } - } - catch (const std::exception &e) - { - std::cerr << "Error during model prepare : " << e.what() << std::endl; - return NNFW_STATUS_ERROR; - } - - _state = State::PREPARED; - return NNFW_STATUS_NO_ERROR; + std::cerr << "Pipeline prepare_pipeline: deprecated feature " << std::endl; + return NNFW_STATUS_ERROR; } NNFW_STATUS nnfw_session::run() @@ -497,12 +453,6 @@ NNFW_STATUS nnfw_session::run() return NNFW_STATUS_INVALID_STATE; } - if (!_executions.empty()) - { - std::cerr << "Error during nnfw_session::run : not supported for pipeline run" << std::endl; - return NNFW_STATUS_ERROR; - } - try { _execution->execute(); @@ -532,13 +482,6 @@ NNFW_STATUS nnfw_session::run_async() return NNFW_STATUS_INVALID_STATE; } - if (!_executions.empty()) - { - std::cerr << "Error during nnfw_session::run_async : not supported for pipeline run" - << std::endl; - return NNFW_STATUS_ERROR; - } - _execution->startExecute(); _state = State::RUNNING; @@ -554,12 +497,6 @@ NNFW_STATUS nnfw_session::await() return NNFW_STATUS_ERROR; } - if (!_executions.empty()) - { - std::cerr << "Error during nnfw_session::await : not supported for pipeline run" << std::endl; - return NNFW_STATUS_ERROR; - } - _execution->waitFinish(); _state = State::FINISHED_RUN; @@ -583,13 +520,6 @@ NNFW_STATUS nnfw_session::set_input(uint32_t index, NNFW_TYPE /*type*/, const vo return NNFW_STATUS_ERROR; } - if (!_executions.empty()) - { - std::cerr << "Error during nnfw_session::set_input : not supported for pipeline run" - << std::endl; - return NNFW_STATUS_ERROR; - } - try { _execution->setInput(onert::ir::IOIndex(index), buffer, length); @@ -619,13 +549,6 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b return NNFW_STATUS_ERROR; } - if (!_executions.empty()) - { - std::cerr << "Error during nnfw_session::set_output : not supported for pipeline run" - << std::endl; - return NNFW_STATUS_ERROR; - } - try { _execution->setOutput(onert::ir::IOIndex(index), buffer, length); @@ -650,7 +573,7 @@ NNFW_STATUS nnfw_session::input_size(uint32_t *number) std::cerr << "Error during nnfw_session::input_size, number is null pointer." << std::endl; return NNFW_STATUS_UNEXPECTED_NULL; } - *number = primary_subgraph()->getInputs().size(); + *number = getInputSize(); } catch (const std::exception &e) { @@ -672,7 +595,7 @@ NNFW_STATUS nnfw_session::output_size(uint32_t *number) std::cerr << "Error during nnfw_session::output_size, number is null pointer." 
<< std::endl; return NNFW_STATUS_UNEXPECTED_NULL; } - *number = primary_subgraph()->getOutputs().size(); + *number = getOutputSize(); } catch (const std::exception &e) { @@ -684,6 +607,13 @@ NNFW_STATUS nnfw_session::output_size(uint32_t *number) NNFW_STATUS nnfw_session::set_input_layout(uint32_t index, NNFW_LAYOUT layout) { + if (!isStatePreparedOrFinishedRun()) + { + std::cerr << "Error during nnfw_session::set_input_layout : " + << "run should be run after prepare" << std::endl; + return NNFW_STATUS_INVALID_STATE; + } + try { if (layout != NNFW_LAYOUT_NONE && layout != NNFW_LAYOUT_CHANNELS_FIRST && @@ -692,14 +622,8 @@ NNFW_STATUS nnfw_session::set_input_layout(uint32_t index, NNFW_LAYOUT layout) std::cerr << "Error during nnfw_session::set_input_layout, not supported layout" << std::endl; return NNFW_STATUS_ERROR; } - if (_execution) - { - _execution->setInputLayout(onert::ir::IOIndex(index), convertLayout(layout)); - } - else - { - _executions.at(0)->setInputLayout(onert::ir::IOIndex(index), convertLayout(layout)); - } + + _execution->setInputLayout(onert::ir::IOIndex(index), convertLayout(layout)); } catch (const std::exception &e) { @@ -711,6 +635,13 @@ NNFW_STATUS nnfw_session::set_input_layout(uint32_t index, NNFW_LAYOUT layout) NNFW_STATUS nnfw_session::set_output_layout(uint32_t index, NNFW_LAYOUT layout) { + if (!isStatePreparedOrFinishedRun()) + { + std::cerr << "Error during nnfw_session::set_output_layout : " + << "run should be run after prepare" << std::endl; + return NNFW_STATUS_INVALID_STATE; + } + try { if (layout != NNFW_LAYOUT_NONE && layout != NNFW_LAYOUT_CHANNELS_FIRST && @@ -720,15 +651,8 @@ NNFW_STATUS nnfw_session::set_output_layout(uint32_t index, NNFW_LAYOUT layout) << std::endl; return NNFW_STATUS_ERROR; } - if (_execution) - { - _execution->setOutputLayout(onert::ir::IOIndex(index), convertLayout(layout)); - } - else - { - _executions.at(_executions.size() - 1) - ->setOutputLayout(onert::ir::IOIndex(index), convertLayout(layout)); - } + + _execution->setOutputLayout(onert::ir::IOIndex(index), convertLayout(layout)); } catch (const std::exception &e) { @@ -771,27 +695,13 @@ NNFW_STATUS nnfw_session::apply_tensorinfo(uint32_t index, nnfw_tensorinfo ti) if (!isStatePreparedOrFinishedRun()) { - // In this case, if we apply input shape in primary_subgraph, it will propagate after - // compilation and excution - auto model = _nnpkg->primary_model(); - auto primary_subgraph = model->primary_subgraph(); - auto ind = primary_subgraph->getInputs().at(index); - auto &input = primary_subgraph->operands().at(ind); - // overwrite input shape with the shape from ti - input.info().shape(new_shape); + // In this case, if we apply input shape, it will propagate after compilation and excution + auto &info = _nnpkg->inputInfo(index); + info.shape(new_shape); } else // when called after nnfw_session::prepare() - { - if (_execution) - { - _execution->changeInputShape(onert::ir::IOIndex(index), new_shape); - } - else - { - _executions.at(0)->changeInputShape(onert::ir::IOIndex(index), new_shape); - } - } + _execution->changeInputShape(onert::ir::IOIndex(index), new_shape); return NNFW_STATUS_NO_ERROR; } @@ -815,21 +725,26 @@ NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) << std::endl; return NNFW_STATUS_UNEXPECTED_NULL; } - if (index >= primary_subgraph()->getInputs().size()) + + if (index >= getInputSize()) { std::cerr << "Error during nnfw_session::input_tensorinfo, index is out of range." 
<< std::endl; return NNFW_STATUS_ERROR; } - auto opidx = primary_subgraph()->getInputs().at(index); - auto shape = primary_subgraph()->operands().at(opidx).shape(); - if (isStatePreparedOrFinishedRun()) + + if (isStateModelLoaded()) + { + auto info = _nnpkg->inputInfo(index); + fillTensorInfo(ti, info.shape(), info.typeInfo().type()); + } + else { - shape = _execution ? _execution->getInputShape(onert::ir::IOIndex{index}) - : _executions.at(0)->getInputShape(onert::ir::IOIndex{index}); + auto io_index = onert::ir::IOIndex{index}; + auto shape = _execution->getInputShape(io_index); + auto dtype = _compiler_artifact->_executors->inputInfo(io_index).typeInfo().type(); + fillTensorInfo(ti, shape, dtype); } - auto dtype = primary_subgraph()->operands().at(opidx).typeInfo().type(); - fillTensorInfo(ti, shape, dtype); } catch (const std::exception &e) { @@ -851,26 +766,27 @@ NNFW_STATUS nnfw_session::output_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) return NNFW_STATUS_UNEXPECTED_NULL; } - if (index >= primary_subgraph()->getOutputs().size()) - { - std::cerr << "Error during nnfw_session::output_tensorinfo, index is out of range." - << std::endl; - return NNFW_STATUS_ERROR; - } - try { - auto opidx = primary_subgraph()->getOutputs().at(index); - auto shape = primary_subgraph()->operands().at(opidx).shape(); - // If it is called after `nnfw_run` then get the shape from Execution, not from the graph - if (isStateFinishedRun()) + if (index >= getOutputSize()) + { + std::cerr << "Error during nnfw_session::output_tensorinfo, index is out of range." + << std::endl; + return NNFW_STATUS_ERROR; + } + + if (isStateModelLoaded()) { - shape = _execution - ? _execution->getOutputShape(onert::ir::IOIndex{index}) - : _executions.at(_executions.size() - 1)->getOutputShape(onert::ir::IOIndex{index}); + auto info = _nnpkg->outputInfo(index); + fillTensorInfo(ti, info.shape(), info.typeInfo().type()); + } + else + { + auto io_index = onert::ir::IOIndex{index}; + auto shape = _execution->getOutputShape(io_index); + auto dtype = _compiler_artifact->_executors->outputInfo(io_index).typeInfo().type(); + fillTensorInfo(ti, shape, dtype); } - auto dtype = primary_subgraph()->operands().at(opidx).typeInfo().type(); - fillTensorInfo(ti, shape, dtype); } catch (const std::exception &e) { @@ -881,86 +797,16 @@ NNFW_STATUS nnfw_session::output_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) return NNFW_STATUS_NO_ERROR; } -void nnfw_session::make_dependency() +NNFW_STATUS nnfw_session::push_pipeline_input(std::vector *, std::vector *) { - for (uint32_t out_exe = 0; out_exe < _executions.size(); out_exe++) - { - auto &out_graph = _executions[out_exe]->primary_subgraph(); - for (uint32_t in_exe = 0; in_exe < _executions.size(); in_exe++) - { - if (out_exe == in_exe) - continue; - auto &in_graph = _executions[in_exe]->primary_subgraph(); - for (auto out = out_graph._name_to_output_begin(); out != out_graph._name_to_output_end(); - out++) - { - auto out_opidx = out_graph.getOutputs().at(out->second); - auto out_shape = out_graph.operands().at(out_opidx).shape(); - for (auto in = in_graph._name_to_input_begin(); in != in_graph._name_to_input_end(); in++) - { - if (out->first != in->first) - continue; - - auto in_opidx = in_graph.getInputs().at(in->second); - auto in_shape = in_graph.operands().at(in_opidx).shape(); - if (out_shape.rank() != in_shape.rank()) - continue; - - bool is_same = true; - for (int32_t i = 0; i < out_shape.rank(); i++) - { - if (out_shape.dim(i) != in_shape.dim(i)) - { - is_same = false; - break; - } - 
} - - if (is_same) - _executions[out_exe]->pushNextExe(_executions[in_exe], out->second, in->second); - } - } - } - } -} - -NNFW_STATUS nnfw_session::push_pipeline_input(std::vector *inputs, - std::vector *lengths) -{ - static uint32_t count = 0; - if (inputs->empty()) - { - _executions[0]->setFinish(); - for (uint32_t i = 0; i < _threads.size(); i++) - { - _threads[i].join(); - } - return NNFW_STATUS_NO_ERROR; - } - _executions[0]->asyncIoDescSemWait(); - _executions[0]->createNewAsyncDesc(count++); - for (uint32_t i = 0; i < inputs->size(); i++) - { - _executions[0]->executeAsyncInput(onert::ir::IOIndex(i), inputs->at(i), lengths->at(i)); - } - _executions[0]->asyncIoDescSemPost(); - return NNFW_STATUS_NO_ERROR; + std::cerr << "Pipeline push_pipeline_input: deprecated feature " << std::endl; + return NNFW_STATUS_ERROR; } -NNFW_STATUS nnfw_session::pop_pipeline_output(std::vector *outputs) +NNFW_STATUS nnfw_session::pop_pipeline_output(std::vector *) { - auto results = _executions[_executions.size() - 1]->getAsyncResults(); - while (results->empty()) - { - if (_executions[_executions.size() - 1]->stopWait()) - return NNFW_STATUS_ERROR; - } - - auto result = results->front(); - results->pop_front(); - for (uint32_t i = 0; i < result.size(); i++) - outputs->push_back(result[i]); - return NNFW_STATUS_NO_ERROR; + std::cerr << "Pipeline pop_pipeline_output: deprecated feature " << std::endl; + return NNFW_STATUS_ERROR; } NNFW_STATUS nnfw_session::register_custom_operation(const std::string &id, @@ -1088,10 +934,6 @@ NNFW_STATUS nnfw_session::set_config(const char *key, const char *value) { options.he_profiling_mode = toBool(value); } - else if (skey == config::DISABLE_COMPILE) - { - options.disable_compile = toBool(value); - } else { return NNFW_STATUS_ERROR; @@ -1103,23 +945,41 @@ const onert::ir::Graph *nnfw_session::primary_subgraph() { if (_nnpkg != nullptr) { - assert(_execution == nullptr && _executions.empty()); + assert(_execution == nullptr); return _nnpkg->primary_model()->primary_subgraph().get(); } else { - assert(_execution != nullptr || !_executions.empty()); - // TODO Remove const_cast + assert(_execution != nullptr); // We assumed the graph will not change after compilation, but shape could change - if (!_executions.empty()) - { - return &_executions[0]->primary_parentgraph(); - } - return &_execution->primary_subgraph(); } } +uint32_t nnfw_session::getInputSize() +{ + if (isStateInitialized()) + throw std::runtime_error{"Model is not loaded yet"}; + + if (isStateModelLoaded()) + return _nnpkg->inputSize(); + + // Session is prepared (general inference) + return _compiler_artifact->_executors->inputSize(); +} + +uint32_t nnfw_session::getOutputSize() +{ + if (isStateInitialized()) + throw std::runtime_error{"Model is not loaded yet"}; + + if (isStateModelLoaded()) + return _nnpkg->outputSize(); + + // Session is prepared (general inference) + return _compiler_artifact->_executors->outputSize(); +} + NNFW_STATUS nnfw_session::get_config(const char *key, char *value, size_t value_size) { if (!isStateModelLoaded()) @@ -1174,7 +1034,7 @@ bool nnfw_session::isStateInitialized() { assert(_nnpkg == nullptr); assert(_coptions.empty()); - assert(_execution == nullptr && _executions.empty()); + assert(_execution == nullptr); return true; } else @@ -1189,7 +1049,7 @@ bool nnfw_session::isStateModelLoaded() { assert(_nnpkg != nullptr); assert(!_coptions.empty()); - assert(_execution == nullptr && _executions.empty()); + assert(_execution == nullptr); return true; } else @@ -1204,7 +1064,7 
@@ bool nnfw_session::isStatePrepared() { assert(_nnpkg == nullptr); assert(!_coptions.empty()); - assert(_execution != nullptr || !_executions.empty()); + assert(_execution != nullptr); return true; } else @@ -1219,7 +1079,7 @@ bool nnfw_session::isStateRunning() { assert(_nnpkg == nullptr); assert(!_coptions.empty()); - assert(_execution != nullptr || !_executions.empty()); + assert(_execution != nullptr); return true; } return false; @@ -1231,7 +1091,7 @@ bool nnfw_session::isStateFinishedRun() { assert(_nnpkg == nullptr); assert(!_coptions.empty()); - assert(_execution != nullptr || !_executions.empty()); + assert(_execution != nullptr); return true; } else diff --git a/runtime/onert/api/src/nnfw_api_internal.h b/runtime/onert/api/src/nnfw_api_internal.h index 9b729fd5f..8e2c2fba6 100644 --- a/runtime/onert/api/src/nnfw_api_internal.h +++ b/runtime/onert/api/src/nnfw_api_internal.h @@ -136,9 +136,6 @@ public: NNFW_STATUS set_available_backends(const char *backends); NNFW_STATUS set_op_backend(const char *op, const char *backend); - // accessor - std::vector> *get_executions() { return &_executions; } - // // Internal-only API // @@ -151,7 +148,6 @@ public: // // Experimental API // - void make_dependency(); NNFW_STATUS push_pipeline_input(std::vector *inputs, std::vector *lengths); NNFW_STATUS pop_pipeline_output(std::vector *outputs); @@ -166,6 +162,9 @@ public: private: const onert::ir::Graph *primary_subgraph(); + uint32_t getInputSize(); + uint32_t getOutputSize(); + bool isStateInitialized(); bool isStateModelLoaded(); bool isStatePrepared(); @@ -181,8 +180,6 @@ private: std::unique_ptr _execution; std::shared_ptr _kernel_registry; std::vector _threads; - std::vector> _executions; - std::string _package_file_path; }; #endif // __API_NNFW_API_INTERNAL_H__ diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc index e709286df..5b0ec92b7 100644 --- a/runtime/onert/backend/acl_cl/KernelGenerator.cc +++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc @@ -256,7 +256,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) auto output_tensor = _tensor_reg->getAclTensor(ofm_index); std::vector input_tensors; - for (auto &ifm_ind : input_indexes) + for (const auto &ifm_ind : input_indexes) input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle()); std::unique_ptr<::arm_compute::IFunction> fn; diff --git a/runtime/onert/backend/acl_common/AclTensorBuilder.h b/runtime/onert/backend/acl_common/AclTensorBuilder.h index e008fd6f5..b0b5ca612 100644 --- a/runtime/onert/backend/acl_common/AclTensorBuilder.h +++ b/runtime/onert/backend/acl_common/AclTensorBuilder.h @@ -162,7 +162,7 @@ void AclTensorBuilder::registerTensorInfo( auto &offset = parent_info.coordinates; auto frontend_layout = parent_info.frontend_layout; - assert(obj.shape().rank() <= ir::Shape::MAX_RANK); + assert(obj.shape().rank() <= ir::Shape::kMaxRank); auto shape = obj.shape(); if (_operands.at(parent_index).shape().rank() >= 4 && frontend_layout == ir::Layout::NHWC && backend_layout == ir::Layout::NCHW) @@ -218,11 +218,11 @@ void AclTensorBuilder::allocate(void) { auto lifetime_map = cl_common::createLifetimeMap(_lifetime_seq, _parent_map); - for (auto &entry : lifetime_map) + for (const auto &entry : lifetime_map) { - auto &use = entry.second; - auto use_type = use.first; - auto use_index = use.second; + const auto &use = entry.second; + const auto &use_type = use.first; + const auto &use_index = use.second; assert(use_index.valid()); if 
(use_type == UsesType::FIRST) _tensor_mgr->startLifetime(use_index); @@ -255,9 +255,9 @@ void AclTensorBuilder::buildTensors(void) assert(_tensor_mgr->nonconstTensors().size() == 0); // Normal tensors - for (auto &entry : _tensor_info_map) + for (const auto &entry : _tensor_info_map) { - auto ind = entry.first; + const auto &ind = entry.first; if (_parent_map.count(ind) > 0) continue; @@ -273,9 +273,9 @@ void AclTensorBuilder::buildTensors(void) assert(_tensor_mgr->nonconstSubtensors().size() == 0); // TODO Iterate `_parent_map` instead, once the optimizer bug is fixed // `Optimizer` iterates the entire Operations, so there is a bug if iterating _parent_map - for (auto &entry : _tensor_info_map) + for (const auto &entry : _tensor_info_map) { - auto ind = entry.first; + const auto &ind = entry.first; if (_parent_map.count(ind) == 0) continue; @@ -343,7 +343,7 @@ template bool AclTensorBuilder::areSubTensorsOf( const ir::OperandIndex &parent, const ir::OperandIndexSequence &seq) { - for (auto &cand : seq) + for (const auto &cand : seq) { if (!isSubTensorOf(parent, cand)) { diff --git a/runtime/onert/backend/cl_common/include/cl_common/BackendContext.h b/runtime/onert/backend/cl_common/include/cl_common/BackendContext.h index 7bb72d74e..5536d2780 100644 --- a/runtime/onert/backend/cl_common/include/cl_common/BackendContext.h +++ b/runtime/onert/backend/cl_common/include/cl_common/BackendContext.h @@ -65,7 +65,7 @@ public: .operands() .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - for (auto &it : ret) + for (auto &&it : ret) { auto &fn_seq = it.second; fn_seq->iterate([&](exec::IFunction &ifunc) { diff --git a/runtime/onert/backend/cpu/BackendContext.cc b/runtime/onert/backend/cpu/BackendContext.cc index e6f7b8470..da48a785d 100644 --- a/runtime/onert/backend/cpu/BackendContext.cc +++ b/runtime/onert/backend/cpu/BackendContext.cc @@ -50,7 +50,7 @@ FunctionMap BackendContext::genKernels() .operands() .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - for (auto &it : ret) + for (auto &&it : ret) { auto &fn_seq = it.second; fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc index 762ee7392..896883bc3 100644 --- a/runtime/onert/backend/cpu/KernelGenerator.cc +++ b/runtime/onert/backend/cpu/KernelGenerator.cc @@ -279,7 +279,7 @@ void KernelGenerator::visit(const ir::operation::AddN &node) const auto output_index{node.getOutputs().at(0)}; std::vector input_tensors; - for (auto &input_idx : node.getInputs()) + for (const auto &input_idx : node.getInputs()) input_tensors.emplace_back(_tensor_reg->getPortableTensor(input_idx)); auto output_tensor = _tensor_reg->getPortableTensor(output_index); @@ -386,7 +386,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector input_tensors; - for (auto &ifm_idx : node.getInputs()) + for (const auto &ifm_idx : node.getInputs()) input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); auto fn = std::make_unique(); @@ -626,7 +626,7 @@ void KernelGenerator::visit(const ir::operation::Einsum &node) auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector input_tensors; - for (auto &ifm_idx : node.getInputs()) + for (const auto &ifm_idx : node.getInputs()) input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); const auto equation = 
node.param().equation; @@ -643,7 +643,7 @@ void KernelGenerator::visit(const ir::operation::Custom &node) auto fill_op_info = [&](const ir::OperandIndexSequence &opSeq, std::vector &types, std::vector &tensors) { - for (auto &idx : opSeq) + for (const auto &idx : opSeq) { const auto &operand = _ctx.at(idx); // TODO make sure using `_current_layout` is correct for custom operations @@ -750,7 +750,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector input_tensors; - for (auto &ifm_idx : node.getInputs()) + for (const auto &ifm_idx : node.getInputs()) input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); auto fn = std::make_unique(); @@ -772,7 +772,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) auto input_tensor = _tensor_reg->getPortableTensor(input_index); std::vector output_tensors; - for (auto &output_idx : node.getOutputs()) + for (const auto &output_idx : node.getOutputs()) output_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx)); auto fn = std::make_unique(); @@ -934,7 +934,7 @@ void KernelGenerator::visit(const ir::operation::Split &node) auto axis_tensor = _tensor_reg->getPortableTensor(axis_idx); std::vector out_tensors; - for (auto &output_idx : node.getOutputs()) + for (const auto &output_idx : node.getOutputs()) out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx)); auto fn = std::make_unique(); @@ -1261,7 +1261,7 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node) auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector input_tensors; - for (auto &ifm_idx : node.getInputs()) + for (const auto &ifm_idx : node.getInputs()) input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); const auto epsilon = node.param().epsilon; @@ -1372,7 +1372,7 @@ void KernelGenerator::visit(const ir::operation::SplitV &node) auto in_split_dim = _tensor_reg->getPortableTensor(split_dim); std::vector out_tensors; - for (auto &output_idx : node.getOutputs()) + for (const auto &output_idx : node.getOutputs()) out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx)); auto fn = std::make_unique(); diff --git a/runtime/onert/backend/gpu_cl/Backend.h b/runtime/onert/backend/gpu_cl/Backend.h index d67ba1602..cdf965557 100644 --- a/runtime/onert/backend/gpu_cl/Backend.h +++ b/runtime/onert/backend/gpu_cl/Backend.h @@ -28,6 +28,7 @@ #include "TensorBuilder.h" #include "tensorflow/lite/delegates/gpu/cl/environment.h" +#include "tensorflow/lite/delegates/gpu/common/precision.h" #include "tensorflow/lite/delegates/gpu/common/status.h" namespace onert @@ -55,15 +56,16 @@ public: { return nullptr; } - auto tm = createTensorManager(&environment->context()); - auto tr = std::make_shared(tm); - - tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info; - create_info.precision = tflite::gpu::cl::CalculationsPrecision::F32; + tflite::gpu::CreateGpuModelInfo create_info; + create_info.precision = tflite::gpu::CalculationsPrecision::F32; create_info.storage_type = tflite::gpu::cl::GetStorageTypeWithMinimalMemoryConsumption(environment->device().GetInfo()); - create_info.hints.Add(tflite::gpu::cl::ModelHints::kFastestInference); + create_info.hints.Add(tflite::gpu::ModelHints::kFastestInference); + + auto tm = createTensorManager(&environment->context(), create_info, environment); + + auto tr = std::make_shared(tm); auto cc = std::make_shared(); cc->device = environment->GetDevicePtr(); @@ -71,7 
+73,7 @@ public: cc->queue = environment->queue(); cc->cache = environment->program_cache(); - auto tb = std::make_shared(operands, tm, create_info, environment); + auto tb = std::make_shared(operands, tm); context->tensor_registry = tr; context->tensor_builder = tb; diff --git a/runtime/onert/backend/gpu_cl/BackendContext.cc b/runtime/onert/backend/gpu_cl/BackendContext.cc index ec9442155..b09319d98 100644 --- a/runtime/onert/backend/gpu_cl/BackendContext.cc +++ b/runtime/onert/backend/gpu_cl/BackendContext.cc @@ -86,6 +86,32 @@ ITensorRegistry *BackendContext::genTensors() return tensor_registry.get(); } +FunctionMap BackendContext::genKernels() +{ + FunctionMap fn_map; + + for (auto op_ind : _data.op_order) + { + auto fn_seq = kernel_gen->generate(op_ind); + fn_map.emplace_back(op_ind, std::move(fn_seq)); + } + + kernel_gen->get_operation(fn_map); + tensor_builder->allocate(); + // NOTE For memory optimization, we want to free some operand data + const_cast(*_data.graph) + .operands() + .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); + + for (auto &&it : fn_map) + { + auto &fn_seq = it.second; + fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); + } + + return fn_map; +} + } // namespace gpu_cl } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/gpu_cl/BackendContext.h b/runtime/onert/backend/gpu_cl/BackendContext.h index 7412d2bce..da5daae02 100644 --- a/runtime/onert/backend/gpu_cl/BackendContext.h +++ b/runtime/onert/backend/gpu_cl/BackendContext.h @@ -25,6 +25,7 @@ #include "ConstantInitializer.h" #include "KernelGenerator.h" #include "TensorBuilder.h" + #include "tensorflow/lite/delegates/gpu/cl/inference_context.h" namespace onert @@ -52,6 +53,7 @@ public: } ITensorRegistry *genTensors() override; + FunctionMap genKernels() override; protected: void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info, diff --git a/runtime/onert/backend/gpu_cl/CMakeLists.txt b/runtime/onert/backend/gpu_cl/CMakeLists.txt index eb1964214..d62dbd84c 100644 --- a/runtime/onert/backend/gpu_cl/CMakeLists.txt +++ b/runtime/onert/backend/gpu_cl/CMakeLists.txt @@ -24,7 +24,26 @@ if(NOT Fp16_FOUND) return() endif(NOT Fp16_FOUND) -nnas_find_package(TensorFlowGpu QUIET) +nnas_find_package(VulkanSource QUIET) +if(NOT VulkanSource_FOUND) + return() +endif(NOT VulkanSource_FOUND) + +nnas_find_package(Opengl_HeadersSource QUIET) +if(NOT Opengl_HeadersSource_FOUND) + return() +endif(NOT Opengl_HeadersSource_FOUND) + +nnas_find_package(Egl_HeadersSource QUIET) +if(NOT Egl_HeadersSource_FOUND) + return() +endif(NOT Egl_HeadersSource_FOUND) + +if (NOT ${TARGET_OS} MATCHES "tizen") + nnas_find_package(FlatBuffers REQUIRED) +endif () + +nnfw_find_package(TensorFlowGpu QUIET) if(NOT TensorFlowGpu_FOUND) message(FATAL_ERROR 'TensorFlowGpu lib not found') return() @@ -35,18 +54,32 @@ file(GLOB_RECURSE SOURCES "*.cc") add_library(${LIB_ONERT_BACKEND_GPU_CL} SHARED ${SOURCES}) target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${TENSORFLOWGPU_SOURCE_DIR}) +target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${TensorFlowSource_DIR}) +target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${VulkanSource_DIR}/include) +target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${Opengl_HeadersSource_DIR}/api) +target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${Egl_HeadersSource_DIR}/api) + 
+if (${TARGET_OS} MATCHES "tizen") + target_compile_options(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE "-Wno-error=deprecated-copy") +endif () + +target_compile_options(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE "-DCL_TARGET_OPENCL_VERSION=220" "-DEGL_NO_X11") target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE abseil) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE dl) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE farmhash) -target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} INTERFACE Open_CL_Headers) +target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE OpenCL_Headers) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE fp16) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE TensorFlowGpu) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE onert_core) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${LIB_ONERT_BACKEND_CL_COMMON}) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE nnfw_common) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE nnfw_coverage) +if (${TARGET_OS} MATCHES "tizen") + target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE flatbuffers) +else() + target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE flatbuffers::flatbuffers) +endif () set_target_properties(${LIB_ONERT_BACKEND_GPU_CL} PROPERTIES OUTPUT_NAME backend_gpu_cl) @@ -55,4 +88,8 @@ if(CMAKE_BUILD_TYPE_LC STREQUAL "release") COMMAND ${CMAKE_STRIP} "--strip-unneeded" $) endif() +add_library(tflite_ignore_warnings INTERFACE) +target_compile_options(tflite_ignore_warnings INTERFACE -Wno-unused-parameter -Wno-sign-compare) +target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE tflite_ignore_warnings) + install(TARGETS ${LIB_ONERT_BACKEND_GPU_CL} DESTINATION lib) diff --git a/runtime/onert/backend/gpu_cl/ClFunction.h b/runtime/onert/backend/gpu_cl/ClFunction.h index 5e8a11a84..6afbd4910 100644 --- a/runtime/onert/backend/gpu_cl/ClFunction.h +++ b/runtime/onert/backend/gpu_cl/ClFunction.h @@ -22,9 +22,9 @@ #include #include -#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" -#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" #include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" +#include "tensorflow/lite/delegates/gpu/cl/cl_operation.h" namespace onert { @@ -35,53 +35,51 @@ namespace gpu_cl class ClFunction : public ::onert::exec::IFunction { public: - ClFunction() : _gpu_operations(), _creation_context() {} - -public: - void configure(std::shared_ptr creation_context) + ClFunction(std::shared_ptr creation_context) + : _creation_context(creation_context), _gpu_operations() { - _creation_context = creation_context; } - void add_operation(std::unique_ptr gpu_operation) +public: + void add_operation(tflite::gpu::cl::ClOperation *gpu_operation) { - _gpu_operations.push_back(std::move(gpu_operation)); + _gpu_operations.push_back(gpu_operation); } void run() override { - for (const auto &gpu_operation : _gpu_operations) + for (const auto gpu_operation : _gpu_operations) { if (!gpu_operation->AddToQueue(_creation_context->queue).ok()) { throw std::runtime_error("Failed to AddToQueue."); } - if (!_creation_context->queue->WaitForCompletion().ok()) - { - throw std::runtime_error("Failed to WaitForCompletion."); - } } } void prepare() override { - for (const auto &gpu_operation : _gpu_operations) + for (const auto gpu_operation : _gpu_operations) { + if (!gpu_operation->GetGpuOperation().AssembleCode(_creation_context->GetGpuInfo()).ok()) + { + throw 
std::runtime_error("Failed to AssembleCode."); + } if (!gpu_operation->Compile(*_creation_context).ok()) { throw std::runtime_error("Failed to Compile."); } - if (!gpu_operation->UpdateParams().ok()) { throw std::runtime_error("Failed to UpdateParams."); } + gpu_operation->GetGpuOperation().args_.ReleaseCPURepresentation(); } } private: - std::vector> _gpu_operations; std::shared_ptr _creation_context; + std::vector _gpu_operations; }; } // namespace gpu_cl diff --git a/runtime/onert/backend/gpu_cl/Config.h b/runtime/onert/backend/gpu_cl/Config.h index 6a455bbb5..f8f94aaf4 100644 --- a/runtime/onert/backend/gpu_cl/Config.h +++ b/runtime/onert/backend/gpu_cl/Config.h @@ -41,9 +41,6 @@ public: bool supportDynamicTensor() override { return false; } bool supportFP16() override { return true; } std::unique_ptr timer() override { return std::make_unique(); } - -private: - void *_handle = nullptr; }; } // namespace gpu_cl diff --git a/runtime/onert/backend/gpu_cl/KernelGenerator.cc b/runtime/onert/backend/gpu_cl/KernelGenerator.cc index 04edc3928..a24c4f59c 100644 --- a/runtime/onert/backend/gpu_cl/KernelGenerator.cc +++ b/runtime/onert/backend/gpu_cl/KernelGenerator.cc @@ -23,10 +23,11 @@ #include "TensorManager.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor.h" -#include "tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.h" -#include "tensorflow/lite/delegates/gpu/cl/selectors/dw_convolution_selector.h" -#include "tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" +#include "tensorflow/lite/delegates/gpu/common/tasks/elementwise.h" +#include "tensorflow/lite/delegates/gpu/common/selectors/convolution_selector.h" +#include "tensorflow/lite/delegates/gpu/common/selectors/dw_convolution_selector.h" +#include "tensorflow/lite/delegates/gpu/common/selectors/simple_selectors.h" #include "ir/Operations.h" #include "ir/Operations.Include.h" @@ -38,9 +39,6 @@ #include "util/logging.h" #include "util/Utils.h" -using namespace tflite::gpu; -using namespace tflite::gpu::cl; - namespace onert { namespace backend @@ -48,39 +46,170 @@ namespace backend namespace gpu_cl { -HW ToHW(int32_t h, int32_t w) { return HW(h > 0 ? h : 1, w > 0 ? 
w : 1); } +void KernelGenerator::addClNode(const std::vector &inputs, + const std::vector &outputs, + std::unique_ptr gpu_op) +{ + tflite::gpu::cl::CLNode cl_node; + cl_node.cl_operation.Init(std::move(gpu_op)); + cl_node.inputs.resize(inputs.size()); + for (size_t i = 0; i < inputs.size(); ++i) + { + cl_node.inputs[i] = inputs[i].value(); + } + cl_node.outputs.resize(outputs.size()); + for (size_t i = 0; i < outputs.size(); ++i) + { + cl_node.outputs[i] = outputs[i].value(); + } + _nodes.push_back(std::move(cl_node)); + _operation_indexes.push_back(_operation_index); + return; +} + +void KernelGenerator::get_operation(FunctionMap &Functions) +{ + size_t size = _nodes.size(); + size_t i = 0; + for (auto &&it : Functions) + { + auto index = it.first; + auto node_index = _operation_indexes[i]; + while (index == node_index) + { + auto &fn_seq = it.second; + auto &node = _nodes[i++]; + for (size_t j = 0; j < node.inputs.size(); ++j) + { + uint32_t idx = node.inputs[j]; + node.cl_operation.GetGpuOperation().SetSrc( + _tensor_reg->getClTensor(ir::OperandIndex{idx})->handle(), j); + } + for (size_t j = 0; j < node.outputs.size(); ++j) + { + uint32_t idx = node.outputs[j]; + node.cl_operation.GetGpuOperation().SetDst( + _tensor_reg->getClTensor(ir::OperandIndex{idx})->handle(), j); + } + fn_seq->iterate([&](exec::IFunction &ifunc) { + static_cast(ifunc).add_operation(&node.cl_operation); + }); + if (i == size) + { + break; + } + node_index = _operation_indexes[i]; + } + if (i == size) + { + break; + } + } +} -template -void UpdatePadding(const ir::PaddingType type, const BHWC &input_shape, AttrT *attr) +absl::Status KernelGenerator::readConstTensor(const ir::OperandIndex &index, + tflite::gpu::TensorOrScalar *param) { - if (type == ir::PaddingType::SAME) + const auto shape = _ctx.at(index).shape(); + if (shape.rank() == 0 && shape.num_elements() == 1) { - attr->padding = CalculateSamePadding(input_shape, *attr); + tflite::gpu::Tensor tensor; + tensor.shape.v = 1; + tensor.data.resize(1); + std::memcpy(&tensor.data[0], _ctx.at(index).data()->base(), _ctx.at(index).operandSize()); + *param = tensor.data[0]; } else { - attr->padding.prepended = HW(0, 0); - attr->padding.appended = HW(0, 0); + if (CheckIfLinearConvertible(&shape)) + { + tflite::gpu::Tensor tensor; + tensor.shape.v = shape.dim(shape.rank() - 1); + tensor.data.resize(shape.num_elements()); + std::memcpy(&tensor.data[0], _ctx.at(index).data()->base(), _ctx.at(index).operandSize()); + *param = std::move(tensor); + } + else + { + tflite::gpu::Tensor tensor; + if (shape.rank() == 3) + { + tensor.shape.h = shape.dim(0); + tensor.shape.w = shape.dim(1); + tensor.shape.c = shape.dim(2); + } + else if (shape.rank() == 4) + { + if (shape.dim(0) != 1) + { + return absl::UnimplementedError("Batch size is not equal to 1."); + } + tensor.shape.h = shape.dim(1); + tensor.shape.w = shape.dim(2); + tensor.shape.c = shape.dim(3); + } + else + { + return absl::InvalidArgumentError( + "Expected a 3D tensor of shape HxWxC or a 4D tensor of shape 1xHxWxC."); + } + tensor.data.resize(shape.num_elements()); + std::memcpy(&tensor.data[0], _ctx.at(index).data()->base(), _ctx.at(index).operandSize()); + *param = std::move(tensor); + } } + return absl::OkStatus(); } -PoolingType convertPoolType(ir::operation::Pool2D::PoolType type_ir) +absl::Status KernelGenerator::readConstTensor( + const ir::OperandIndex &index, + absl::variant, + tflite::gpu::Tensor> *alpha) { - switch (type_ir) + const auto shape = _ctx.at(index).shape(); + if 
(CheckIfLinearConvertible(&shape)) { - case ir::operation::Pool2D::PoolType::AVG: - return PoolingType::AVERAGE; - case ir::operation::Pool2D::PoolType::MAX: - return PoolingType::MAX; - default: - throw std::runtime_error("gpu_Cl KernelGenerator : Not supported operation yet"); + tflite::gpu::Tensor tensor; + tensor.shape.v = shape.dim(shape.rank() - 1); + tensor.data.resize(shape.num_elements()); + std::memcpy(&tensor.data[0], _ctx.at(index).data()->base(), _ctx.at(index).operandSize()); + *alpha = std::move(tensor); } + else + { + tflite::gpu::Tensor tensor; + if (shape.rank() == 3) + { + tensor.shape.h = shape.dim(0); + tensor.shape.w = shape.dim(1); + tensor.shape.c = shape.dim(2); + } + else if (shape.rank() == 4) + { + if (shape.dim(0) != 1) + { + return absl::UnimplementedError("Batch size is not equal to 1."); + } + tensor.shape.h = shape.dim(1); + tensor.shape.w = shape.dim(2); + tensor.shape.c = shape.dim(3); + } + else + { + return absl::InvalidArgumentError( + "Expected a 3D tensor of shape HxWxC or a 4D tensor of shape 1xHxWxC."); + } + tensor.data.resize(shape.num_elements()); + std::memcpy(&tensor.data[0], _ctx.at(index).data()->base(), _ctx.at(index).operandSize()); + *alpha = std::move(tensor); + } + return absl::OkStatus(); } -KernelGenerator::KernelGenerator(const ir::Graph &graph, - const std::shared_ptr &tensor_builder, - const std::shared_ptr &tensor_reg, - const std::shared_ptr &creation_context) +KernelGenerator::KernelGenerator( + const ir::Graph &graph, const std::shared_ptr &tensor_builder, + const std::shared_ptr &tensor_reg, + const std::shared_ptr &creation_context) : basic::KernelGeneratorBase{graph}, _ctx(graph.operands()), _operations_ctx(graph.operations()), _current_layout{graph.layout()}, _tensor_builder(tensor_builder), _tensor_reg(tensor_reg), _creation_context(creation_context) @@ -89,13 +218,13 @@ KernelGenerator::KernelGenerator(const ir::Graph &graph, std::unique_ptr KernelGenerator::generate(ir::OperationIndex ind) { - auto ret = std::make_unique(); - ret->enableDynamicShapeInferer(false); - + auto fn_seq = std::make_unique(); + fn_seq->enableDynamicShapeInferer(false); + _operation_index = ind; const auto &op = _graph.operations().at(ind); op.accept(*this); - ret->append(releaseFunction()); - return ret; + fn_seq->append(releaseFunction()); + return fn_seq; } void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) @@ -104,63 +233,66 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)}; - // const auto activation = node.param().activation; + tflite::gpu::OperationDef op_def; + op_def.precision = tflite::gpu::CalculationsPrecision::F32; + + const bool lhs_const = _ctx.at(lhs_index).isConstant(); + const bool rhs_const = _ctx.at(rhs_index).isConstant(); + + if (lhs_const && rhs_const) + { + throw std::runtime_error("No runtime input tensors for " + node.name()); + } + + auto fn = std::make_unique(_creation_context); + std::unique_ptr gpu_op; - OperationDef op_def; - op_def.precision = CalculationsPrecision::F32; + tflite::gpu::OperationType op_type = convertArithmeticType(node.param().arithmetic_type); - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(lhs_index)->descriptor); - auto lhs_shape = _tensor_reg->getClTensorReserver(lhs_index)->shape; + if (!lhs_const && !rhs_const) + { + auto lhs_shape = 
_tensor_reg->getClTensor(lhs_index)->get_info()._shape; + auto rhs_shape = _tensor_reg->getClTensor(rhs_index)->get_info()._shape; + + bool swap = + (op_type == tflite::gpu::OperationType::MUL) && + (lhs_shape.h <= rhs_shape.h && lhs_shape.w <= rhs_shape.w && lhs_shape.c <= rhs_shape.c); - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(rhs_index)->descriptor); - auto rhs_shape = _tensor_reg->getClTensorReserver(rhs_index)->shape; + auto first_index = swap ? rhs_index : lhs_index; + auto second_index = swap ? lhs_index : rhs_index; - op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(ofm_index)->descriptor); - auto out_shape = _tensor_reg->getClTensorReserver(ofm_index)->shape; + op_def.src_tensors.push_back(_tensor_reg->getClTensor(first_index)->get_info()._desc); + op_def.src_tensors.push_back(_tensor_reg->getClTensor(second_index)->get_info()._desc); + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(ofm_index)->get_info()._desc); - auto fn = std::make_unique(); + auto second_shape = _tensor_reg->getClTensor(second_index)->get_info()._shape; - std::unique_ptr gpu_op; - switch (node.param().arithmetic_type) + tflite::gpu::GPUOperation operation = CreateElementwiseTwoInput(op_def, op_type, second_shape); + gpu_op = std::make_unique(std::move(operation)); + + addClNode({first_index, second_index}, {ofm_index}, std::move(gpu_op)); + } + else { - case ir::operation::BinaryArithmetic::ArithmeticType::ADD: - { - std::vector channels(2); - channels[0] = lhs_shape.c; - channels[1] = rhs_shape.c; - SelectAdd(op_def, channels, out_shape.c, &gpu_op); - - auto ofm_tensor = _tensor_reg->getClTensor(ofm_index); - auto lhs_tensor = _tensor_reg->getClTensor(lhs_index); - auto rhs_tensor = _tensor_reg->getClTensor(rhs_index); - gpu_op->SetSrc(lhs_tensor->handle(), ir::operation::BinaryArithmetic::Input::LHS); - gpu_op->SetSrc(rhs_tensor->handle(), ir::operation::BinaryArithmetic::Input::RHS); - gpu_op->SetDst(ofm_tensor->handle(), 0); - - fn->configure(_creation_context); - fn->add_operation(std::move(gpu_op)); - break; - } - case ir::operation::BinaryArithmetic::ArithmeticType::SUB: - { - // NYI - break; - } - case ir::operation::BinaryArithmetic::ArithmeticType::MUL: - { - // NYI - break; - } - case ir::operation::BinaryArithmetic::ArithmeticType::DIV: + auto non_const_index = rhs_const ? lhs_index : rhs_index; + auto const_index = rhs_const ? 
rhs_index : lhs_index; + + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(ofm_index)->get_info()._desc); + op_def.src_tensors.push_back(_tensor_reg->getClTensor(non_const_index)->get_info()._desc); + + tflite::gpu::ElementwiseAttributes attr; + + if (!readConstTensor(const_index, &attr.param).ok()) { - // NYI - break; + throw std::runtime_error("BinaryArithmetic unsupported constant tensor"); } - default: - assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations"); - break; - } + tflite::gpu::GPUOperation operation = + CreateElementwise(_creation_context->GetGpuInfo(), op_def, op_type, attr); + gpu_op = absl::make_unique(std::move(operation)); + + addClNode({non_const_index}, {ofm_index}, std::move(gpu_op)); + } _return_fn = std::move(fn); } @@ -174,30 +306,30 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) const auto param = node.param(); - OperationDef op_def; - op_def.precision = CalculationsPrecision::F32; + tflite::gpu::OperationDef op_def; + op_def.precision = tflite::gpu::CalculationsPrecision::F32; - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input)->descriptor); + op_def.src_tensors.push_back(_tensor_reg->getClTensor(input)->get_info()._desc); - auto input_shape = _tensor_reg->getClTensorReserver(input)->shape; - auto kernel_shape = _tensor_reg->getClTensorReserver(kernel)->shape; - auto output_shape = _tensor_reg->getClTensorReserver(output)->shape; - auto bias_shape = _tensor_reg->getClTensorReserver(bias)->shape; + auto input_shape = _tensor_reg->getClTensor(input)->get_info()._shape; + auto kernel_shape = _tensor_reg->getClTensor(kernel)->get_info()._shape; + auto output_shape = _tensor_reg->getClTensor(output)->get_info()._shape; + auto bias_shape = _tensor_reg->getClTensor(bias)->get_info()._shape; - op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output)->descriptor); + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(output)->get_info()._desc); - ModelHints hints; - std::unique_ptr gpu_op; // = InitSingleOpSubgraph(inputs, outputs, gpu_subgraph); + tflite::gpu::ModelHints hints; + std::unique_ptr + gpu_op; // = InitSingleOpSubgraph(inputs, outputs, gpu_subgraph); - auto input_tensor = _tensor_reg->getClTensor(input); auto kernel_tensor = _tensor_reg->getClTensor(kernel); auto bias_tensor = _tensor_reg->getClTensor(bias); - auto output_tensor = _tensor_reg->getClTensor(output); - Convolution2DAttributes attr; + tflite::gpu::Convolution2DAttributes attr; attr.strides = ToHW(param.stride.vertical, param.stride.horizontal); - attr.dilations = HW(std::max(static_cast(1), param.dilation.height_factor), - std::max(static_cast(1), param.dilation.width_factor)); + attr.dilations = + tflite::gpu::HW(std::max(static_cast(1), param.dilation.height_factor), + std::max(static_cast(1), param.dilation.width_factor)); bool is_weight = (_ctx.at(kernel).isConstant() ? 
true : false); @@ -220,12 +352,14 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) UpdatePadding(param.padding.type, input_shape, &attr); - gpu_op = SelectConvolution(attr, output_shape, _creation_context->GetDeviceInfo(), op_def, hints); - gpu_op->SetSrc(input_tensor->handle(), ir::operation::Conv2D::INPUT); + gpu_op = SelectConvolution(attr, output_shape, _creation_context->GetGpuInfo(), op_def, hints); - auto fn = std::make_unique(); + tflite::gpu::cl::CLNode cl_node; + cl_node.inputs.resize(1); + cl_node.inputs[0] = input.value(); + cl_node.outputs.resize(1); - fn->configure(_creation_context); + auto fn = std::make_unique(_creation_context); const auto activation = node.param().activation; @@ -233,47 +367,43 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) { case ir::Activation::NONE: { - gpu_op->SetDst(output_tensor->handle(), 0); - fn->add_operation(std::move(gpu_op)); + addClNode({input}, {output}, std::move(gpu_op)); break; } + case ir::Activation::RELU: case ir::Activation::RELU6: { - std::unique_ptr gpu_op_1; - OperationDef op_def_1; - std::shared_ptr new_tensor = std::make_shared(); - - _new_tensors[output] = new_tensor; - if (!CreateTensor(*_creation_context->context, output_shape, - _tensor_reg->getClTensorReserver(output)->descriptor, new_tensor.get()) - .ok()) - { - throw std::runtime_error("Error CreateTensor."); - } + std::unique_ptr gpu_op_1; + tflite::gpu::OperationDef op_def_1; + const auto shape = _ctx.at(output).shape(); + auto new_ind = _tensor_reg->addNewClTensor(shape); + + addClNode({input}, {new_ind}, std::move(gpu_op)); - gpu_op->SetDst(new_tensor.get(), 0); - fn->add_operation(std::move(gpu_op)); - op_def_1.precision = CalculationsPrecision::F32; - op_def_1.src_tensors.push_back(_tensor_reg->getClTensorReserver(output)->descriptor); - op_def_1.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output)->descriptor); + op_def_1.precision = tflite::gpu::CalculationsPrecision::F32; + op_def_1.src_tensors.push_back(_tensor_reg->getClTensor(output)->get_info()._desc); + op_def_1.dst_tensors.push_back(_tensor_reg->getClTensor(output)->get_info()._desc); - // - ReLU6: clip = 6, alpha = 0 - ReLUAttributes attr_1; - attr_1.clip = 6; + tflite::gpu::ReLUAttributes attr_1; + if (activation == ir::Activation::RELU6) + { + attr_1.clip = 6; + } + else + { + attr_1.clip = 0; + } attr_1.alpha = 0; gpu_op_1 = SelectReLU(attr_1, op_def_1); - gpu_op_1->SetSrc(new_tensor.get(), 0); - gpu_op_1->SetDst(output_tensor->handle(), 0); - fn->add_operation(std::move(gpu_op_1)); + addClNode({new_ind}, {output}, std::move(gpu_op_1)); break; } default: { - throw std::runtime_error("gpu_cl KernelGenerator : Not supported operation yet"); + throw std::runtime_error("gpu_cl KernelGenerator : Not supported Conv2D activiation"); } } - _return_fn = std::move(fn); } @@ -292,28 +422,23 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; - auto ofm_tensor = _tensor_reg->getClTensor(ofm_index); - auto ifm_tensor = _tensor_reg->getClTensor(ifm_index); - auto ker_tensor = _tensor_reg->getClTensor(ker_index); - auto bias_tensor = _tensor_reg->getClTensor(bias_index); - bool is_weight = (_ctx.at(ker_index).isConstant() ? 
true : false); - OperationDef op_def; - op_def.precision = CalculationsPrecision::F32; + tflite::gpu::OperationDef op_def; + op_def.precision = tflite::gpu::CalculationsPrecision::F32; - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(ifm_index)->descriptor); - auto input_shape = _tensor_reg->getClTensorReserver(ifm_index)->shape; + op_def.src_tensors.push_back(_tensor_reg->getClTensor(ifm_index)->get_info()._desc); + auto input_shape = _tensor_reg->getClTensor(ifm_index)->get_info()._shape; - auto ker_shape = _tensor_reg->getClTensorReserver(ker_index)->shape; + auto ker_shape = _tensor_reg->getClTensor(ker_index)->get_info()._shape; - op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(ofm_index)->descriptor); - auto out_shape = _tensor_reg->getClTensorReserver(ofm_index)->shape; - auto bias_shape = _tensor_reg->getClTensorReserver(bias_index)->shape; + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(ofm_index)->get_info()._desc); + auto out_shape = _tensor_reg->getClTensor(ofm_index)->get_info()._shape; + auto bias_shape = _tensor_reg->getClTensor(bias_index)->get_info()._shape; - DepthwiseConvolution2DAttributes attr; + tflite::gpu::DepthwiseConvolution2DAttributes attr; attr.strides = ToHW(stride.vertical, stride.horizontal); - attr.dilations = HW(std::max(static_cast(1), dilation.height_factor), - std::max(static_cast(1), dilation.width_factor)); + attr.dilations = tflite::gpu::HW(std::max(static_cast(1), dilation.height_factor), + std::max(static_cast(1), dilation.width_factor)); if (is_weight) { @@ -323,12 +448,14 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) attr.weights.shape.w = ker_shape.w; attr.weights.shape.i = ker_shape.c; attr.weights.data.resize(ker_shape.DimensionsProduct()); - memcpy(attr.weights.data.data(), _ctx.at(ker_index).data()->base(), ker_tensor->total_size()); + memcpy(attr.weights.data.data(), _ctx.at(ker_index).data()->base(), + _ctx.at(ker_index).operandSize()); } attr.bias.id = bias_index.value(); attr.bias.shape.v = bias_shape.b != 1 ? 
bias_shape.b : bias_shape.c; attr.bias.data.resize(bias_shape.DimensionsProduct()); - memcpy(attr.bias.data.data(), _ctx.at(bias_index).data()->base(), bias_tensor->total_size()); + memcpy(attr.bias.data.data(), _ctx.at(bias_index).data()->base(), + _ctx.at(bias_index).operandSize()); UpdatePadding(padding.type, input_shape, &attr); if (multiplier != 1) @@ -338,7 +465,7 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const int filter_width = ker_shape.w; const int output_depth = out_shape.c; - tflite::gpu::Tensor weights; + tflite::gpu::Tensor weights; weights.id = attr.weights.id; weights.shape = tflite::gpu::OHWI(output_depth, filter_height, filter_width, input_depth); weights.data.resize(weights.shape.DimensionsProduct()); @@ -356,12 +483,12 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) attr.weights = std::move(weights); } - auto fn = std::make_unique(); - std::unique_ptr gpu_op; + auto fn = std::make_unique(_creation_context); + std::unique_ptr gpu_op; if (is_weight) { - gpu_op = SelectDWConvolution(attr, _creation_context->GetDeviceInfo(), op_def); + gpu_op = SelectDWConvolution(attr, _creation_context->GetGpuInfo(), op_def); } else { @@ -370,57 +497,51 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) throw std::runtime_error( "No support of depthwise runtime weights with channel multiplier != 1"); } - gpu_op = SelectDWConvolutionDynamicWeights(attr, _creation_context->GetDeviceInfo(), op_def); + gpu_op = SelectDWConvolutionDynamicWeights(attr, _creation_context->GetGpuInfo(), op_def); } - gpu_op->SetSrc(ifm_tensor->handle(), ir::operation::DepthwiseConv2D::Input::INPUT); - - fn->configure(_creation_context); - const auto activation = node.param().activation; switch (activation) { case ir::Activation::NONE: { - gpu_op->SetDst(ofm_tensor->handle(), 0); - fn->add_operation(std::move(gpu_op)); + addClNode({ifm_index}, {ofm_index}, std::move(gpu_op)); break; } + case ir::Activation::RELU: case ir::Activation::RELU6: { - std::unique_ptr gpu_op_1; - OperationDef op_def_1; - std::shared_ptr new_tensor = std::make_shared(); - - _new_tensors[ofm_index] = new_tensor; - if (!CreateTensor(*_creation_context->context, out_shape, - _tensor_reg->getClTensorReserver(ofm_index)->descriptor, new_tensor.get()) - .ok()) - { - throw std::runtime_error("Error CreateTensor."); - } + std::unique_ptr gpu_op_1; + tflite::gpu::OperationDef op_def_1; + const auto shape = _ctx.at(ofm_index).shape(); + auto new_ind = _tensor_reg->addNewClTensor(shape); + + addClNode({ifm_index}, {new_ind}, std::move(gpu_op)); - gpu_op->SetDst(new_tensor.get(), 0); - fn->add_operation(std::move(gpu_op)); - op_def_1.precision = CalculationsPrecision::F32; - op_def_1.src_tensors.push_back(_tensor_reg->getClTensorReserver(ofm_index)->descriptor); - op_def_1.dst_tensors.push_back(_tensor_reg->getClTensorReserver(ofm_index)->descriptor); + op_def_1.precision = tflite::gpu::CalculationsPrecision::F32; - // - ReLU6: clip = 6, alpha = 0 - ReLUAttributes attr_1; - attr_1.clip = 6; + op_def_1.src_tensors.push_back(_tensor_reg->getClTensor(ofm_index)->get_info()._desc); + op_def_1.dst_tensors.push_back(_tensor_reg->getClTensor(ofm_index)->get_info()._desc); + + tflite::gpu::ReLUAttributes attr_1; + if (activation == ir::Activation::RELU6) + { + attr_1.clip = 6; + } + else + { + attr_1.clip = 0; + } attr_1.alpha = 0; gpu_op_1 = SelectReLU(attr_1, op_def_1); - gpu_op_1->SetSrc(new_tensor.get(), 0); - gpu_op_1->SetDst(ofm_tensor->handle(), 0); - 
fn->add_operation(std::move(gpu_op_1)); + addClNode({new_ind}, {ofm_index}, std::move(gpu_op_1)); break; } default: { - throw std::runtime_error("gpu_cl KernelGenerator : Not supported operation yet"); + throw std::runtime_error("gpu_cl KernelGenerator : Not supported DepthwiseConv2D acvivation"); } } @@ -429,26 +550,23 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node) { - std::unique_ptr gpu_op; - auto fn = std::make_unique(); + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)}; + + tflite::gpu::OperationDef op_def; + op_def.precision = tflite::gpu::CalculationsPrecision::F32; + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(output_index)->get_info()._desc); + op_def.src_tensors.push_back(_tensor_reg->getClTensor(input_index)->get_info()._desc); + + std::unique_ptr gpu_op; + auto fn = std::make_unique(_creation_context); switch (node.param().op_type) { case ir::operation::ElementwiseActivation::Type::LEAKY_RELU: case ir::operation::ElementwiseActivation::Type::RELU: { - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{ - node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)}; - - OperationDef op_def; - op_def.precision = CalculationsPrecision::F32; - auto output_tensor = _tensor_reg->getClTensor(output_index); - auto input_tensor = _tensor_reg->getClTensor(input_index); - op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output_index)->descriptor); - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input_index)->descriptor); - - ReLUAttributes attr; + tflite::gpu::ReLUAttributes attr; if (ir::operation::ElementwiseActivation::Type::LEAKY_RELU == node.param().op_type) { attr.alpha = node.param().alpha; @@ -460,17 +578,33 @@ void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node) attr.clip = node.param().alpha; } gpu_op = SelectReLU(attr, op_def); - gpu_op->SetSrc(input_tensor->handle(), ir::operation::ElementwiseActivation::Input::INPUT); - gpu_op->SetDst(output_tensor->handle(), 0); - fn->configure(_creation_context); - fn->add_operation(std::move(gpu_op)); - - _return_fn = std::move(fn); + break; + } + case ir::operation::ElementwiseActivation::Type::LOGISTIC: + { + if (_ctx.at(input_index).typeInfo().type() != ir::DataType::FLOAT32) + { + throw std::runtime_error{"Unsupported data type of LOGISTIC"}; + } + tflite::gpu::GPUOperation operation = + CreateElementwiseOneInput(_creation_context->GetGpuInfo(), op_def, + convertElementwiseActivationType(node.param().op_type)); + gpu_op = std::make_unique(std::move(operation)); + break; + } + case ir::operation::ElementwiseActivation::Type::TANH: + { + tflite::gpu::GPUOperation operation = CreateElementwiseOneInput( + _creation_context->GetGpuInfo(), op_def, tflite::gpu::OperationType::TANH); + gpu_op = std::make_unique(std::move(operation)); break; } default: - throw std::runtime_error("gpu_cl KernelGenerator : Not supported operation yet"); + throw std::runtime_error( + "gpu_cl KernelGenerator : Not supported operation on ElementwiseActivation"); } + addClNode({input_index}, {output_index}, std::move(gpu_op)); + _return_fn = std::move(fn); } void KernelGenerator::visit(const ir::operation::Pool2D &node) @@ -478,24 +612,24 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node) const auto output_index{node.getOutputs().at(0)}; const auto 
input_index{node.getInputs().at(ir::operation::Pool2D::Input::INPUT)}; - OperationDef op_def; - op_def.precision = CalculationsPrecision::F32; + tflite::gpu::OperationDef op_def; + op_def.precision = tflite::gpu::CalculationsPrecision::F32; - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input_index)->descriptor); - auto input_shape = _tensor_reg->getClTensorReserver(input_index)->shape; + op_def.src_tensors.push_back(_tensor_reg->getClTensor(input_index)->get_info()._desc); + auto input_shape = _tensor_reg->getClTensor(input_index)->get_info()._shape; - op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output_index)->descriptor); + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(output_index)->get_info()._desc); const auto kh = node.param().kh; const auto kw = node.param().kw; const auto stride = node.param().stride; const auto op_type = convertPoolType(node.param().op_type); - Pooling2DAttributes attributes; + tflite::gpu::Pooling2DAttributes attributes; attributes.type = op_type; - attributes.kernel = HW(kh > 0 ? kh : 1, kw > 0 ? kw : 1); - attributes.strides = - HW(stride.vertical > 0 ? stride.vertical : 1, stride.horizontal > 0 ? stride.horizontal : 1); + attributes.kernel = tflite::gpu::HW(kh > 0 ? kh : 1, kw > 0 ? kw : 1); + attributes.strides = tflite::gpu::HW(stride.vertical > 0 ? stride.vertical : 1, + stride.horizontal > 0 ? stride.horizontal : 1); if (node.param().padding.type == ir::PaddingType::SAME) { @@ -503,23 +637,15 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node) } else { - attributes.padding.prepended = HW(0, 0); - attributes.padding.appended = HW(0, 0); + attributes.padding.prepended = tflite::gpu::HW(0, 0); + attributes.padding.appended = tflite::gpu::HW(0, 0); } - auto fn = std::make_unique(); - std::unique_ptr gpu_op; + auto fn = std::make_unique(_creation_context); + std::unique_ptr gpu_op; gpu_op = SelectPooling(attributes, op_def); - auto input_tensor = _tensor_reg->getClTensor(input_index); - auto output_tensor = _tensor_reg->getClTensor(output_index); - - gpu_op->SetSrc(input_tensor->handle(), ir::operation::Pool2D::Input::INPUT); - gpu_op->SetDst(output_tensor->handle(), 0); - - fn->configure(_creation_context); - fn->add_operation(std::move(gpu_op)); - + addClNode({input_index}, {output_index}, std::move(gpu_op)); _return_fn = std::move(fn); } @@ -528,31 +654,24 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - OperationDef op_def; - op_def.precision = CalculationsPrecision::F32; + tflite::gpu::OperationDef op_def; + op_def.precision = tflite::gpu::CalculationsPrecision::F32; - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input_index)->descriptor); - auto input_shape = _tensor_reg->getClTensorReserver(input_index)->shape; + op_def.src_tensors.push_back(_tensor_reg->getClTensor(input_index)->get_info()._desc); + auto input_shape = _tensor_reg->getClTensor(input_index)->get_info()._shape; - op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output_index)->descriptor); - auto output_shape = _tensor_reg->getClTensorReserver(output_index)->shape; + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(output_index)->get_info()._desc); + auto output_shape = _tensor_reg->getClTensor(output_index)->get_info()._shape; - ReshapeAttributes attr; + tflite::gpu::ReshapeAttributes attr; attr.new_shape = output_shape; - auto fn = 
std::make_unique(); - std::unique_ptr gpu_op; + auto fn = std::make_unique(_creation_context); + std::unique_ptr gpu_op; const int src_channels = input_shape.c; SelectReshape(src_channels, attr.new_shape.c, op_def, &gpu_op); - auto input_tensor = _tensor_reg->getClTensor(input_index); - auto output_tensor = _tensor_reg->getClTensor(output_index); - gpu_op->SetSrc(input_tensor->handle(), ir::operation::Reshape::Input::INPUT); - gpu_op->SetDst(output_tensor->handle(), 0); - - fn->configure(_creation_context); - fn->add_operation(std::move(gpu_op)); - + addClNode({input_index}, {output_index}, std::move(gpu_op)); _return_fn = std::move(fn); } @@ -568,27 +687,20 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) throw std::runtime_error("Softmax.beta != 1 is not supported in gpu_cl"); } - OperationDef op_def; - op_def.precision = CalculationsPrecision::F32; + tflite::gpu::OperationDef op_def; + op_def.precision = tflite::gpu::CalculationsPrecision::F32; - op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output_index)->descriptor); + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(output_index)->get_info()._desc); - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input_index)->descriptor); - auto input_shape = _tensor_reg->getClTensorReserver(input_index)->shape; + op_def.src_tensors.push_back(_tensor_reg->getClTensor(input_index)->get_info()._desc); + auto input_shape = _tensor_reg->getClTensor(input_index)->get_info()._shape; - auto fn = std::make_unique(); + auto fn = std::make_unique(_creation_context); - std::unique_ptr gpu_op; + std::unique_ptr gpu_op; SelectSoftmax(input_shape, op_def, &gpu_op); - auto output_tensor = _tensor_reg->getClTensor(output_index); - auto input_tensor = _tensor_reg->getClTensor(input_index); - - gpu_op->SetSrc(input_tensor->handle(), ir::operation::Softmax::Input::INPUT); - gpu_op->SetDst(output_tensor->handle(), 0); - - fn->configure(_creation_context); - fn->add_operation(std::move(gpu_op)); + addClNode({input_index}, {output_index}, std::move(gpu_op)); _return_fn = std::move(fn); } diff --git a/runtime/onert/backend/gpu_cl/KernelGenerator.h b/runtime/onert/backend/gpu_cl/KernelGenerator.h index 91fd3cd9d..5e8c2621f 100644 --- a/runtime/onert/backend/gpu_cl/KernelGenerator.h +++ b/runtime/onert/backend/gpu_cl/KernelGenerator.h @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -46,6 +47,8 @@ public: std::unique_ptr generate(ir::OperationIndex ind) override; + void get_operation(FunctionMap &Functions); + private: void visit(const ir::operation::BinaryArithmetic &) override; void visit(const ir::operation::Conv2D &) override; @@ -54,6 +57,14 @@ private: void visit(const ir::operation::Pool2D &) override; void visit(const ir::operation::Reshape &) override; void visit(const ir::operation::Softmax &) override; + absl::Status readConstTensor(const ir::OperandIndex &index, tflite::gpu::TensorOrScalar *param); + absl::Status readConstTensor( + const ir::OperandIndex &index, + absl::variant, + tflite::gpu::Tensor> *alpha); + void addClNode(const std::vector &inputs, + const std::vector &outputs, + std::unique_ptr gpu_op); private: const ir::Operands &_ctx; @@ -62,7 +73,9 @@ private: std::shared_ptr _tensor_builder; std::shared_ptr _tensor_reg; std::shared_ptr _creation_context; - ir::OperandIndexMap> _new_tensors; + std::vector _nodes; + ir::OperationIndex _operation_index; + std::vector _operation_indexes; }; } // namespace gpu_cl diff --git a/runtime/onert/backend/gpu_cl/MemoryManager.h 
b/runtime/onert/backend/gpu_cl/MemoryManager.h index a3b9b39de..4b34c39b9 100644 --- a/runtime/onert/backend/gpu_cl/MemoryManager.h +++ b/runtime/onert/backend/gpu_cl/MemoryManager.h @@ -17,17 +17,18 @@ #ifndef __ONERT_BACKEND_GPU_CL_MEMORY_MANAGER_H__ #define __ONERT_BACKEND_GPU_CL_MEMORY_MANAGER_H__ -#include "ex/InferenceContextEx.h" #include "operand/CLTensor.h" #include "ir/OperandIndexMap.h" #include "ir/OperandInfo.h" #include "util/logging.h" +#include "tensorflow/lite/delegates/gpu/spi.h" #include "tensorflow/lite/delegates/gpu/cl/cl_context.h" +#include "tensorflow/lite/delegates/gpu/cl/inference_context.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h" #include "tensorflow/lite/delegates/gpu/common/status.h" -#include "tensorflow/lite/delegates/gpu/cl/storage_type_util.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" +#include "tensorflow/lite/delegates/gpu/common/task/storage_type_util.h" #include @@ -41,24 +42,31 @@ namespace gpu_cl class MemoryManager { public: - MemoryManager(tflite::gpu::cl::CLContext *context) : _context{context} {} + MemoryManager(tflite::gpu::cl::CLContext *context, tflite::gpu::CreateGpuModelInfo create_info, + const std::shared_ptr &environment) + : _context{context}, _create_info{create_info}, _environment{environment} + { + } ~MemoryManager() = default; void allocate(void) { + std::unique_ptr converter_builder = + NewConverterBuilder(_environment.get()); for (const auto &tensor_entry : _tensors) { auto tensor = tensor_entry.second; auto type = tensor->get_type(); - // if (type == TensorType::TENSOR_TYPE_DELETE) { - // continue; - // } + if (type == TensorType::TENSOR_TYPE_DELETE) + { + continue; + } + + const auto &shape = tensor->get_info()._shape; + const auto &descriptor = tensor->get_info()._desc; - const auto &t = tensor_reserver_.Get(tensor_entry.first.value()); - const auto &shape = t->shape; - const auto &descriptor = t->descriptor; if (!CreateTensor(*_context, shape, descriptor, tensor->handle()).ok()) { std::runtime_error("Failed to CreateTensor"); @@ -66,10 +74,10 @@ public: switch (type) { case TensorType::TENSOR_TYPE_INPUT: - tensor->writeConvertInit(); + tensor->writeConvertInit(converter_builder.get(), _environment); break; case TensorType::TENSOR_TYPE_OUTPUT: - tensor->readConvertInit(); + tensor->readConvertInit(converter_builder.get(), _environment); break; default: break; @@ -89,65 +97,60 @@ public: { /* DO NOTHING */ } - void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info, - tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info, - std::shared_ptr environment, - tflite::gpu::cl::DeviceInfo &device_info, TensorType type) + void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info, TensorType type) { - tflite::gpu::ValueId max_id = 0; - auto data_type = DeduceDataTypeFromPrecision(create_info.precision); - const auto shape = info.shape(); + auto data_type = DeduceDataTypeFromPrecision(_create_info.precision); - auto tensor = std::make_shared(shape.rank(), shape, environment, type); - _tensors[ind] = tensor; - tflite::gpu::BHWC t_shape; - switch (shape.rank()) + tflite::gpu::BHWC BHWC_shape = ToBHWC(info.shape()); + + tflite::gpu::TensorStorageType storage_type = _create_info.storage_type; + tflite::gpu::Layout layout = + BHWC_shape.b == 1 ? 
tflite::gpu::Layout::HWC : tflite::gpu::Layout::BHWC; + + if (!SelectBestStorageType(_environment->device().GetInfo(), BHWC_shape, storage_type, + data_type, layout, &storage_type) + .ok()) { - case 1: - // B layout - t_shape = tflite::gpu::BHWC(shape.dim(0), 1, 1, 1); - break; - case 2: - // BC layout - t_shape = tflite::gpu::BHWC(shape.dim(0), 1, 1, shape.dim(1)); - break; - case 3: - // BWC layout - t_shape = tflite::gpu::BHWC(shape.dim(0), 1, shape.dim(1), shape.dim(2)); - break; - case 4: - // BHWC layout - t_shape = tflite::gpu::BHWC(shape.dim(0), shape.dim(1), shape.dim(2), shape.dim(3)); - break; - default: - break; + throw std::runtime_error("Failed to SelectBestStorageType"); } + auto tensor = std::make_shared( + info.shape().rank(), type, BHWC_shape, + tflite::gpu::TensorDescriptor{data_type, storage_type, layout}); + _tensors[ind] = tensor; + } - tflite::gpu::cl::TensorStorageType storage_type = create_info.storage_type; - tflite::gpu::Layout layout = - t_shape.b == 1 ? tflite::gpu::Layout::HWC : tflite::gpu::Layout::BHWC; + ir::OperandIndex addTensor(const ir::Shape &shape) + { + auto data_type = DeduceDataTypeFromPrecision(_create_info.precision); - tflite::gpu::ValueId id = ind.value(); - storage_type = - tflite::gpu::cl::SelectBestStorageType(device_info, t_shape, storage_type, data_type, layout); - auto dummy = std::make_shared(); - dummy->shape = t_shape; - dummy->descriptor = tflite::gpu::cl::TensorDescriptor{data_type, storage_type, layout}; - tensor_reserver_.Add(id, dummy); + tflite::gpu::BHWC BHWC_shape = ToBHWC(shape); - max_id = std::max(max_id, id); + tflite::gpu::TensorStorageType storage_type = _create_info.storage_type; + tflite::gpu::Layout layout = + BHWC_shape.b == 1 ? tflite::gpu::Layout::HWC : tflite::gpu::Layout::BHWC; - tensor_reserver_.SetNext(max_id + 1); + if (!SelectBestStorageType(_environment->device().GetInfo(), BHWC_shape, storage_type, + data_type, layout, &storage_type) + .ok()) + { + throw std::runtime_error("Failed to SelectBestStorageType"); + } + auto ind = ir::OperandIndex(_new_id--); + auto tensor = std::make_shared( + shape.rank(), TensorType::TENSOR_TYPE_VALID, BHWC_shape, + tflite::gpu::TensorDescriptor{data_type, storage_type, layout}); + _tensors[ind] = tensor; + return ind; } ir::OperandIndexMap> &tensors(void) { return _tensors; } - InferenceContextEx::TensorReserverEx &tensorReservers(void) { return tensor_reserver_; } - private: ir::OperandIndexMap> _tensors; - InferenceContextEx::TensorReserverEx tensor_reserver_; tflite::gpu::cl::CLContext *_context; + tflite::gpu::CreateGpuModelInfo _create_info; + std::shared_ptr _environment; + uint32_t _new_id = UINT32_MAX; }; } // namespace gpu_cl diff --git a/runtime/onert/backend/gpu_cl/TensorBuilder.cc b/runtime/onert/backend/gpu_cl/TensorBuilder.cc index e71733427..318335471 100644 --- a/runtime/onert/backend/gpu_cl/TensorBuilder.cc +++ b/runtime/onert/backend/gpu_cl/TensorBuilder.cc @@ -21,7 +21,6 @@ #include "TensorManager.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h" #include "tensorflow/lite/delegates/gpu/cl/cl_device.h" #include "tensorflow/lite/delegates/gpu/cl/inference_context.h" @@ -45,11 +44,8 @@ namespace gpu_cl using UsesType = cl_common::UsesType; -TensorBuilder::TensorBuilder(const ir::Operands &operands, TensorManager *tensor_mgr, - tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info, - const std::shared_ptr &environment) - : _operands{operands}, _tensor_mgr{tensor_mgr}, 
_create_info{create_info}, _environment{ - environment} +TensorBuilder::TensorBuilder(const ir::Operands &operands, TensorManager *tensor_mgr) + : _operands{operands}, _tensor_mgr{tensor_mgr} { assert(_tensor_mgr); } @@ -89,9 +85,9 @@ void TensorBuilder::allocate(void) { auto lifetime_map = cl_common::createLifetimeMap(_lifetime_seq, _parent_map); - for (auto &entry : lifetime_map) + for (const auto &entry : lifetime_map) { - auto &use = entry.second; + const auto &use = entry.second; auto use_type = use.first; auto use_index = use.second; assert(use_index.valid()); @@ -118,18 +114,22 @@ void TensorBuilder::buildTensors(void) assert(_tensor_mgr->constTensors().size() == 0); assert(_tensor_mgr->nonconstTensors().size() == 0); // Normal tensors - for (auto &entry : _tensor_info_map) + for (const auto &entry : _tensor_info_map) { - auto ind = entry.first; + const auto &ind = entry.first; if (_parent_map.count(ind) > 0) continue; auto type = _tensor_type_map.at(ind); const auto &info = entry.second; - _tensor_mgr->buildTensor(ind, info, _create_info, _environment, _environment->device().info_, - type); + _tensor_mgr->buildTensor(ind, info, type); } } +ir::OperandIndex TensorBuilder::addTensor(const ir::Shape &shape) +{ + return _tensor_mgr->addTensor(shape); +} + } // namespace gpu_cl } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/gpu_cl/TensorBuilder.h b/runtime/onert/backend/gpu_cl/TensorBuilder.h index 2a5cb8b5e..e0333fef5 100644 --- a/runtime/onert/backend/gpu_cl/TensorBuilder.h +++ b/runtime/onert/backend/gpu_cl/TensorBuilder.h @@ -34,9 +34,7 @@ namespace gpu_cl class TensorBuilder { public: - TensorBuilder(const ir::Operands &operands, TensorManager *tensor_mgr, - tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info, - const std::shared_ptr &environment); + TensorBuilder(const ir::Operands &operands, TensorManager *tensor_mgr); /** * @brief Register tensor information to allocate on ACL-CL backend @@ -83,6 +81,7 @@ public: private: void buildTensors(void); ir::OperandIndex findRootParent(ir::OperandIndex index); + ir::OperandIndex addTensor(const ir::Shape &shape); private: const ir::Operands &_operands; @@ -92,8 +91,6 @@ private: ir::OperandIndexMap _uses_count_map; std::unique_ptr _tensor_mgr; - tflite::gpu::cl::InferenceContext::CreateInferenceInfo _create_info; - std::shared_ptr _environment; // for linear executor cl_common::LifetimeSeq _lifetime_seq; diff --git a/runtime/onert/backend/gpu_cl/TensorBuilderHelper.h b/runtime/onert/backend/gpu_cl/TensorBuilderHelper.h deleted file mode 100644 index 7290ff5da..000000000 --- a/runtime/onert/backend/gpu_cl/TensorBuilderHelper.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ -#define __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ - -#include "absl/status/status.h" -#include "tensorflow/lite/delegates/gpu/common/shape.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -enum TensorType -{ - TENSOR_TYPE_VALID = 0, - TENSOR_TYPE_INPUT = 1, - TENSOR_TYPE_OUTPUT = 2, - TENSOR_TYPE_DELETE = 3 -}; - -absl::Status ExtractAxisFromIndex(int dims, int index, tflite::gpu::Axis *axis); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ diff --git a/runtime/onert/backend/gpu_cl/TensorManager.cc b/runtime/onert/backend/gpu_cl/TensorManager.cc index 9fe0605ac..02e26ed91 100644 --- a/runtime/onert/backend/gpu_cl/TensorManager.cc +++ b/runtime/onert/backend/gpu_cl/TensorManager.cc @@ -42,23 +42,28 @@ void TensorManager::deallocateConsts(void) { _const_mgr->deallocate(); } void TensorManager::deallocateNonconsts(void) { _nonconst_mgr->deallocate(); } void TensorManager::buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info, - tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info, - std::shared_ptr environment, - tflite::gpu::cl::DeviceInfo &device_info, TensorType type) + TensorType type) { assert(_ind_to_mgr.find(ind) == _ind_to_mgr.end()); if (info.isConstant()) { - _const_mgr->buildTensor(ind, info, create_info, environment, device_info, type); + _const_mgr->buildTensor(ind, info, type); _ind_to_mgr.insert({ind, *_const_mgr}); } else { - _nonconst_mgr->buildTensor(ind, info, create_info, environment, device_info, type); + _nonconst_mgr->buildTensor(ind, info, type); _ind_to_mgr.insert({ind, *_nonconst_mgr}); } } +ir::OperandIndex TensorManager::addTensor(const ir::Shape &shape) +{ + auto ind = _nonconst_mgr->addTensor(shape); + _ind_to_mgr.insert({ind, *_nonconst_mgr}); + + return ind; +} void TensorManager::startLifetime(const ir::OperandIndex &ind) { @@ -96,29 +101,6 @@ ir::OperandIndexMap> &TensorManager::nonconst return _nonconst_mgr->tensors(); } -std::shared_ptr TensorManager::atR(const ir::OperandIndex &ind) -{ - if (_nonconst_mgr->tensorReservers().HaveTensor(ind.value())) - { - return _nonconst_mgr->tensorReservers().Get(ind.value()); - } - else if (_const_mgr->tensorReservers().HaveTensor(ind.value())) - { - return _const_mgr->tensorReservers().Get(ind.value()); - } - return nullptr; -} - -InferenceContextEx::TensorReserverEx &TensorManager::constTensorReservers(void) -{ - return _const_mgr->tensorReservers(); -} - -InferenceContextEx::TensorReserverEx &TensorManager::nonconstTensorReservers(void) -{ - return _nonconst_mgr->tensorReservers(); -} - void TensorManager::iterate(const std::function &fn) { for (auto it : _nonconst_mgr->tensors()) diff --git a/runtime/onert/backend/gpu_cl/TensorManager.h b/runtime/onert/backend/gpu_cl/TensorManager.h index 52abc579a..5b09ac130 100644 --- a/runtime/onert/backend/gpu_cl/TensorManager.h +++ b/runtime/onert/backend/gpu_cl/TensorManager.h @@ -19,8 +19,10 @@ #include "MemoryManager.h" +#include "Utils.h" + #include "tensorflow/lite/delegates/gpu/cl/inference_context.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h" #include "ir/OperandInfo.h" #include "ir/OperandIndexMap.h" @@ -44,10 +46,8 @@ public: void deallocateConsts(void); void deallocateNonconsts(void); - void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info, - 
tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info, - std::shared_ptr environment, - tflite::gpu::cl::DeviceInfo &device_info, TensorType type); + void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info, TensorType type); + ir::OperandIndex addTensor(const ir::Shape &shape); std::shared_ptr findTensorAsParent(const ir::OperandIndex &ind); @@ -55,10 +55,6 @@ public: void finishLifetime(const ir::OperandIndex &ind); std::shared_ptr at(const ir::OperandIndex &ind); - std::shared_ptr atR(const ir::OperandIndex &ind); - - InferenceContextEx::TensorReserverEx &constTensorReservers(void); - InferenceContextEx::TensorReserverEx &nonconstTensorReservers(void); ir::OperandIndexMap> &constTensors(void); ir::OperandIndexMap> &nonconstTensors(void); @@ -73,10 +69,14 @@ private: ir::OperandIndexMap _ind_to_mgr; }; -inline TensorManager *createTensorManager(tflite::gpu::cl::CLContext *context) +inline TensorManager * +createTensorManager(tflite::gpu::cl::CLContext *context, + tflite::gpu::CreateGpuModelInfo create_info, + const std::shared_ptr &environment) { VERBOSE(createTensorManager) << "GPU-CL TensorManager" << std::endl; - return new TensorManager(new MemoryManager(context), new MemoryManager(context)); + return new TensorManager(new MemoryManager(context, create_info, environment), + new MemoryManager(context, create_info, environment)); } } // namespace gpu_cl diff --git a/runtime/onert/backend/gpu_cl/TensorRegistry.h b/runtime/onert/backend/gpu_cl/TensorRegistry.h index 6f17aff54..be342e9cb 100644 --- a/runtime/onert/backend/gpu_cl/TensorRegistry.h +++ b/runtime/onert/backend/gpu_cl/TensorRegistry.h @@ -44,7 +44,7 @@ public: auto getClTensor(const ir::OperandIndex &ind) { return _tensor_mgr->at(ind).get(); } - auto getClTensorReserver(const ir::OperandIndex &ind) { return _tensor_mgr->atR(ind); } + ir::OperandIndex addNewClTensor(const ir::Shape &shape) { return _tensor_mgr->addTensor(shape); } private: TensorManager *_tensor_mgr; diff --git a/runtime/onert/backend/gpu_cl/Utils.h b/runtime/onert/backend/gpu_cl/Utils.h new file mode 100644 index 000000000..1953c0e43 --- /dev/null +++ b/runtime/onert/backend/gpu_cl/Utils.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ +#define __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ + +#include "absl/status/status.h" + +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/operations.h" + +#include "ir/operation/BinaryArithmetic.h" +#include "ir/operation/ElementwiseActivation.h" +#include "ir/operation/ElementwiseBinary.h" +#include "ir/operation/ElementwiseUnary.h" +#include "ir/operation/Pool2D.h" + +namespace onert +{ +namespace backend +{ +namespace gpu_cl +{ + +inline tflite::gpu::HW ToHW(int32_t h, int32_t w) +{ + return tflite::gpu::HW(h > 0 ? h : 1, w > 0 ? 
w : 1); +} + +template +inline void UpdatePadding(const ir::PaddingType type, const tflite::gpu::BHWC &input_shape, + AttrT *attr) +{ + if (type == ir::PaddingType::SAME) + { + attr->padding = CalculateSamePadding(input_shape, *attr); + } + else + { + attr->padding.prepended = tflite::gpu::HW(0, 0); + attr->padding.appended = tflite::gpu::HW(0, 0); + } +} + +inline tflite::gpu::PoolingType convertPoolType(ir::operation::Pool2D::PoolType type_ir) +{ + switch (type_ir) + { + case ir::operation::Pool2D::PoolType::AVG: + return tflite::gpu::PoolingType::AVERAGE; + case ir::operation::Pool2D::PoolType::MAX: + return tflite::gpu::PoolingType::MAX; + default: + throw std::runtime_error("gpu_Cl KernelGenerator : Not supported operation yet"); + } +} + +inline tflite::gpu::BHWC ToBHWC(ir::Shape shape) +{ + switch (shape.rank()) + { + case 1: + // B layout + return tflite::gpu::BHWC(shape.dim(0), 1, 1, 1); + break; + case 2: + // BC layout + return tflite::gpu::BHWC(shape.dim(0), 1, 1, shape.dim(1)); + break; + case 3: + // BWC layout + return tflite::gpu::BHWC(shape.dim(0), 1, shape.dim(1), shape.dim(2)); + break; + case 4: + // BHWC layout + return tflite::gpu::BHWC(shape.dim(0), shape.dim(1), shape.dim(2), shape.dim(3)); + break; + default: + break; + } + return tflite::gpu::BHWC(); +} + +inline bool CheckIfLinearConvertible(const ir::Shape *shape) +{ + if (shape->num_elements() <= 0) + { + return false; + } + for (int i = 0; i < shape->rank() - 1; ++i) + { + if (shape->dim(i) != 1) + { + return false; + } + } + return true; +} + +inline tflite::gpu::OperationType +convertArithmeticType(ir::operation::BinaryArithmetic::ArithmeticType arithmetic_type_ir) +{ + switch (arithmetic_type_ir) + { + case ir::operation::BinaryArithmetic::ArithmeticType::ADD: + return tflite::gpu::OperationType::ADD; + case ir::operation::BinaryArithmetic::ArithmeticType::SUB: + return tflite::gpu::OperationType::SUB; + case ir::operation::BinaryArithmetic::ArithmeticType::MUL: + return tflite::gpu::OperationType::MUL; + case ir::operation::BinaryArithmetic::ArithmeticType::DIV: + return tflite::gpu::OperationType::DIV; + default: + throw std::runtime_error("Unsupported ArithmeticType"); + } +} + +inline tflite::gpu::OperationType +convertElementwiseActivationType(ir::operation::ElementwiseActivation::Type type_ir) +{ + switch (type_ir) + { + case ir::operation::ElementwiseActivation::Type::LOGISTIC: + return tflite::gpu::OperationType::SIGMOID; + default: + throw std::runtime_error("Unsupported ElementwiseActivationType"); + } +} + +enum TensorType +{ + TENSOR_TYPE_VALID = 0, + TENSOR_TYPE_INPUT = 1, + TENSOR_TYPE_OUTPUT = 2, + TENSOR_TYPE_DELETE = 3 +}; + +} // namespace gpu_cl +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ diff --git a/runtime/onert/backend/gpu_cl/ex/InferenceContextEx.h b/runtime/onert/backend/gpu_cl/ex/InferenceContextEx.h deleted file mode 100644 index f67387904..000000000 --- a/runtime/onert/backend/gpu_cl/ex/InferenceContextEx.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_INFERENCE_CONTEXT_EX_H__ -#define __ONERT_BACKEND_GPU_CL_INFERENCE_CONTEXT_EX_H__ - -#include "tensorflow/lite/delegates/gpu/cl/inference_context.h" -#include "tensorflow/lite/delegates/gpu/common/model.h" -#include "absl/strings/str_cat.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -class InferenceContextEx : public tflite::gpu::cl::InferenceContext -{ -public: - struct DummyTensor - { - tflite::gpu::BHWC shape; - tflite::gpu::cl::TensorDescriptor descriptor; - - bool operator==(const DummyTensor &b) const - { - return shape == b.shape && descriptor == b.descriptor; - } - }; - - class TensorReserverEx - { - public: - tflite::gpu::ValueId Add(const std::shared_ptr &dummy) - { - reservations_[next_] = dummy; - return next_++; - } - void Add(tflite::gpu::ValueId id, const std::shared_ptr &dummy) - { - reservations_[id] = dummy; - } - void SetNext(tflite::gpu::ValueId id) { next_ = id; } - bool HaveTensor(tflite::gpu::ValueId id) - { - return reservations_.find(id) != reservations_.end(); - } - std::shared_ptr Get(tflite::gpu::ValueId id) { return reservations_[id]; } - - std::vector> - GetTensorDescs() const - { - std::vector> result; - for (auto &v : reservations_) - { - tflite::gpu::cl::TensorDescriptor desc = v.second->descriptor; - desc.shape.b = v.second->shape.b; - desc.shape.h = v.second->shape.h; - desc.shape.w = v.second->shape.w; - desc.shape.d = 1; - desc.shape.c = v.second->shape.c; - result.push_back({v.first, desc}); - } - return result; - } - - void Add(const std::vector> - &tensors) - { - for (auto &v : tensors) - { - auto dummy = std::make_shared(); - dummy->descriptor = v.second; - dummy->shape.b = v.second.shape.b; - dummy->shape.h = v.second.shape.h; - dummy->shape.w = v.second.shape.w; - dummy->shape.c = v.second.shape.c; - Add(v.first, dummy); - } - } - - private: - // absl::flat_hash_map reservations_; - std::unordered_map> reservations_; - tflite::gpu::ValueId next_ = 0; - }; -}; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_INFERENCE_CONTEXT_EX_H__ diff --git a/runtime/onert/backend/gpu_cl/operand/CLTensor.cc b/runtime/onert/backend/gpu_cl/operand/CLTensor.cc index d3ed102a1..1b19b10f8 100644 --- a/runtime/onert/backend/gpu_cl/operand/CLTensor.cc +++ b/runtime/onert/backend/gpu_cl/operand/CLTensor.cc @@ -19,7 +19,7 @@ #include "tensorflow/lite/delegates/gpu/cl/buffer.h" #include "tensorflow/lite/delegates/gpu/cl/cl_context.h" #include "tensorflow/lite/delegates/gpu/cl/tensor.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h" using namespace tflite::gpu::cl; @@ -32,9 +32,9 @@ namespace gpu_cl namespace operand { -CLTensor::CLTensor(size_t rank, ir::Shape shape, - std::shared_ptr environment, TensorType type) - : ICLTensor{rank, shape, environment, type}, _tensor(std::make_shared()) +CLTensor::CLTensor(size_t rank, TensorType type, tflite::gpu::BHWC shape, + tflite::gpu::TensorDescriptor desc) + : ICLTensor{rank, type, shape, desc}, 
_tensor(std::make_shared()) { } diff --git a/runtime/onert/backend/gpu_cl/operand/CLTensor.h b/runtime/onert/backend/gpu_cl/operand/CLTensor.h index f2153f430..269551d0c 100644 --- a/runtime/onert/backend/gpu_cl/operand/CLTensor.h +++ b/runtime/onert/backend/gpu_cl/operand/CLTensor.h @@ -38,8 +38,8 @@ public: CLTensor() = delete; public: - CLTensor(size_t rank, ir::Shape shape, std::shared_ptr environment, - TensorType type); + CLTensor(size_t rank, TensorType type, tflite::gpu::BHWC shape, + tflite::gpu::TensorDescriptor desc); public: const tflite::gpu::cl::Tensor *handle() const override; diff --git a/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc b/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc index a95f78056..ef71bbc13 100644 --- a/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc +++ b/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc @@ -43,8 +43,10 @@ void ICLTensor::access(const std::function &fn) fn(*this); } -void ICLTensor::writeConvertInit() +void ICLTensor::writeConvertInit(tflite::gpu::TensorObjectConverterBuilder *converter_builder, + std::shared_ptr environment) { + _environment = environment; TensorObjectDef input_def; input_def.dimensions.b = handle()->Batch(); input_def.dimensions.h = handle()->Height(); @@ -74,21 +76,20 @@ void ICLTensor::writeConvertInit() output_def.object_def.data_type = handle()->GetDataType(); input_def.object_def.user_provided = false; - _converter_builder = NewConverterBuilder(_environment.get()); - if (!_converter_builder->MakeConverter(input_def, permute_def, &_converter_to).ok()) + if (!converter_builder->MakeConverter(input_def, permute_def, &_converter_to).ok()) { throw std::runtime_error("Failed to make converter_to"); } - if (!_converter_builder->MakeConverter(permute_def, output_def, &_converter_from).ok()) + if (!converter_builder->MakeConverter(permute_def, output_def, &_converter_from).ok()) { throw std::runtime_error("Failed to make converter_from"); } } -void ICLTensor::readConvertInit() +void ICLTensor::readConvertInit(tflite::gpu::TensorObjectConverterBuilder *converter_builder, + std::shared_ptr environment) { - _converter_builder = NewConverterBuilder(_environment.get()); - + _environment = environment; TensorObjectDef input_def; input_def.dimensions.b = handle()->Batch(); input_def.dimensions.h = handle()->Height(); @@ -118,20 +119,20 @@ void ICLTensor::readConvertInit() TensorObjectDef output_def = permute_def; output_def.object_def.object_type = ObjectType::CPU_MEMORY; - if (!_converter_builder->MakeConverter(input_def, permute_def, &_converter_from).ok()) + if (!converter_builder->MakeConverter(input_def, permute_def, &_converter_from).ok()) { throw std::runtime_error("Failed to make converter_from"); } - if (!_converter_builder->MakeConverter(permute_def, output_def, &_converter_to).ok()) + if (!converter_builder->MakeConverter(permute_def, output_def, &_converter_to).ok()) { throw std::runtime_error("Failed to make converter_to"); } } -void ICLTensor::enqueueWriteBuffer(const void *ptr, bool) +void ICLTensor::enqueueWriteBuffer(const void *ptr, bool blocking) { - TensorObject input_obj = - MakeReadableCpuMemory(absl::MakeSpan(static_cast(ptr), _shape.num_elements())); + TensorObject input_obj = MakeReadableCpuMemory( + absl::MakeSpan(static_cast(ptr), _info._shape.DimensionsProduct())); TensorObject output_obj; @@ -162,13 +163,19 @@ void ICLTensor::enqueueWriteBuffer(const void *ptr, bool) { throw std::runtime_error("Failed to write cl buffer from cpu memory"); } + + if (blocking && 
!_environment->queue()->WaitForCompletion().ok()) + { + throw std::runtime_error("Failed to WaitForCompletion"); + } + if (!_converter_from->Convert(permute_obj, output_obj).ok()) { throw std::runtime_error("Failed to change layout"); } } -void ICLTensor::enqueueReadBuffer(void *ptr, bool) +void ICLTensor::enqueueReadBuffer(void *ptr, bool blocking) { TensorObject input_obj; @@ -196,7 +203,7 @@ void ICLTensor::enqueueReadBuffer(void *ptr, bool) } TensorObject output_obj = - MakeCpuMemory(absl::MakeSpan(static_cast(ptr), _shape.num_elements())); + MakeCpuMemory(absl::MakeSpan(static_cast(ptr), _info._shape.DimensionsProduct())); if (!_converter_from->Convert(input_obj, permute_obj).ok()) { @@ -206,6 +213,11 @@ void ICLTensor::enqueueReadBuffer(void *ptr, bool) { throw std::runtime_error("Failed to read cl buffer"); } + + if (blocking && !_environment->queue()->WaitForCompletion().ok()) + { + throw std::runtime_error("Failed to WaitForCompletion"); + } } } // namespace operand diff --git a/runtime/onert/backend/gpu_cl/operand/ICLTensor.h b/runtime/onert/backend/gpu_cl/operand/ICLTensor.h index b8ad4469f..47420a1c2 100644 --- a/runtime/onert/backend/gpu_cl/operand/ICLTensor.h +++ b/runtime/onert/backend/gpu_cl/operand/ICLTensor.h @@ -26,7 +26,7 @@ #include "tensorflow/lite/delegates/gpu/cl/tensor.h" #include "tensorflow/lite/delegates/gpu/cl/environment.h" -#include "TensorBuilderHelper.h" +#include "Utils.h" namespace onert { @@ -37,6 +37,12 @@ namespace gpu_cl namespace operand { +struct TensorInfo +{ + tflite::gpu::BHWC _shape; + tflite::gpu::TensorDescriptor _desc; +}; + class ICLTensor : public ITensor { public: @@ -46,15 +52,15 @@ public: ICLTensor(ICLTensor &&) = default; ICLTensor &operator=(ICLTensor &&) = default; - ICLTensor(size_t rank, ir::Shape shape, std::shared_ptr environment, - TensorType type) - : _rank{rank}, _shape{shape}, _environment(environment), _type(type) + ICLTensor(size_t rank, TensorType type, tflite::gpu::BHWC shape, + tflite::gpu::TensorDescriptor desc) + : _rank{rank}, _type(type), _info{shape, desc} { } public: uint8_t *buffer() const final { return reinterpret_cast(handle()->GetMemoryPtr()); } - size_t total_size() const final { return _shape.num_elements() * sizeof(float); } + size_t total_size() const final { return _info._shape.DimensionsProduct() * sizeof(float); } size_t calcOffset(const ir::Coordinates &) const final { throw std::runtime_error("ICLTensor::calcOffset() is not supported."); @@ -78,16 +84,38 @@ public: throw std::runtime_error("ICLTensor::data_zero_points() is not supported."); } bool is_dynamic() const override { return false; } - ir::Shape getShape() const override { return _shape; } + ir::Shape getShape() const override + { + tflite::gpu::BHWC shape = _info._shape; + switch (_rank) + { + case 1: + return ir::Shape{shape.b}; + case 2: + return ir::Shape{shape.b, shape.c}; + case 3: + return ir::Shape{shape.b, shape.w, shape.c}; + case 4: + return ir::Shape{shape.b, shape.h, shape.w, shape.c}; + default: + break; + } + return ir::Shape{}; + } bool has_padding() const override { return false; } void access(const std::function &fn) final; bool needMemoryMap() const final { return true; } void enqueueWriteBuffer(const void *ptr, bool blocking = true) final; void enqueueReadBuffer(void *ptr, bool blocking = true) final; - void writeConvertInit(); - void readConvertInit(); + void writeConvertInit(tflite::gpu::TensorObjectConverterBuilder *converter_builder, + std::shared_ptr environment); + void 
readConvertInit(tflite::gpu::TensorObjectConverterBuilder *converter_builder, + std::shared_ptr environment); + TensorType get_type() { return _type; } + TensorType set_type(TensorType type) { return _type = type; } + const TensorInfo get_info() { return _info; } public: virtual const tflite::gpu::cl::Tensor *handle() const = 0; @@ -96,11 +124,10 @@ public: private: protected: size_t _rank; // Actual rank (reflects extended rank) - ir::Shape _shape; - std::shared_ptr _environment; TensorType _type; - std::unique_ptr _converter_builder; + TensorInfo _info; tflite::gpu::cl::CLMemory _cl_memory; + std::shared_ptr _environment; std::unique_ptr _converter_to; std::unique_ptr _converter_from; }; diff --git a/runtime/onert/backend/ruy/BackendContext.cc b/runtime/onert/backend/ruy/BackendContext.cc index 877772619..48da91b50 100644 --- a/runtime/onert/backend/ruy/BackendContext.cc +++ b/runtime/onert/backend/ruy/BackendContext.cc @@ -50,7 +50,7 @@ FunctionMap BackendContext::genKernels() .operands() .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - for (auto &it : ret) + for (auto &&it : ret) { auto &fn_seq = it.second; fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); diff --git a/runtime/onert/backend/trix/BackendContext.cc b/runtime/onert/backend/trix/BackendContext.cc index e46b11d20..39048f2be 100644 --- a/runtime/onert/backend/trix/BackendContext.cc +++ b/runtime/onert/backend/trix/BackendContext.cc @@ -50,7 +50,7 @@ FunctionMap BackendContext::genKernels() .operands() .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - for (auto &it : ret) + for (auto &&it : ret) { auto &fn_seq = it.second; fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); diff --git a/runtime/onert/backend/trix/BatchThreadPool.cc b/runtime/onert/backend/trix/BatchThreadPool.cc new file mode 100644 index 000000000..3c2001d75 --- /dev/null +++ b/runtime/onert/backend/trix/BatchThreadPool.cc @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BatchThreadPool.h" + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +BatchThreadPool::BatchThreadPool(size_t num_threads) : _num_threads(num_threads), _stop_all(false) +{ + _worker_threads.reserve(_num_threads); + for (uint32_t thread_num = 0; thread_num < _num_threads; ++thread_num) + { + _worker_threads.emplace_back([this, thread_num]() { this->worker(thread_num); }); + } +} + +void BatchThreadPool::worker(uint32_t thread_num) +{ + while (true) + { + std::unique_lock lock(_m_job_queue); + _cv_job_queue.wait(lock, [this]() { return !this->_job_queue.empty() || _stop_all; }); + if (_stop_all && this->_job_queue.empty()) + { + return; + } + + // Pop a job in front of queue + auto job = std::move(_job_queue.front()); + _job_queue.pop(); + lock.unlock(); + + // Run the job + job(thread_num); + } +} + +BatchThreadPool::~BatchThreadPool() +{ + _stop_all = true; + _cv_job_queue.notify_all(); + + for (auto &&t : _worker_threads) + { + t.join(); + } +} + +} // namespace trix +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/trix/BatchThreadPool.h b/runtime/onert/backend/trix/BatchThreadPool.h new file mode 100644 index 000000000..bc2936fb4 --- /dev/null +++ b/runtime/onert/backend/trix/BatchThreadPool.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_TRIX_BATCH_THREAD_POOL_H__ +#define __ONERT_BACKEND_TRIX_BATCH_THREAD_POOL_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +/** + * @brief Class that has a threadpool for batch-by-batch multi-threading + * + */ +class BatchThreadPool +{ +public: + BatchThreadPool(size_t num_threads); + ~BatchThreadPool(); + + /** + * @brief + * + * @tparam F Type of the function for job + * @tparam Args Type of arguments of job + * @param f Function for job + * @param args Arguments of job + * @return std::future::type> + */ + template + std::future::type> enqueueJob(F &&f, + Args &&... 
args) + { + if (_stop_all) + { + throw std::runtime_error("Stop all threads in BatchThreadPool"); + } + + using return_type = typename std::result_of::type; + auto job = std::make_shared>( + std::bind(std::forward(f), std::placeholders::_1, std::forward(args)...)); + std::future job_result_future = job->get_future(); + { + // Push job in the assigned queue + std::lock_guard lock(_m_job_queue); + + // Push job + _job_queue.push([job](uint32_t thread_num) { (*job)(thread_num); }); + } + _cv_job_queue.notify_one(); + + return job_result_future; + } + +private: + /** + * @brief Worker to run jobs + * + * @param thread_num Thread number on which worker is running + */ + void worker(uint32_t thread_num); + +private: + /** + * @brief The number of threads + * + */ + size_t _num_threads; + + /** + * @brief Threads worked for jobs + * + */ + std::vector _worker_threads; + + /** + * @brief Queue for jobs + * + */ + std::queue> _job_queue; + + /** + * @brief condition_variables for _job_queue and _worker_threads + * + */ + std::condition_variable _cv_job_queue; + + /** + * @brief Mutex for the queue _job_queue + * + */ + std::mutex _m_job_queue; + + /** + * @brief Whether all threads are stopped + * + */ + bool _stop_all; +}; + +} // namespace trix +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_TRIX_BATCH_THREAD_POOL_H__ diff --git a/runtime/onert/backend/trix/Convert.cc b/runtime/onert/backend/trix/Convert.cc new file mode 100644 index 000000000..fe003e7ea --- /dev/null +++ b/runtime/onert/backend/trix/Convert.cc @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Convert.h" + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +data_layout convertDataLayout(const ir::Layout layout) +{ + switch (layout) + { + case ir::Layout::NCHW: + return DATA_LAYOUT_NCHW; + case ir::Layout::NHWC: + return DATA_LAYOUT_NHWC; + default: + throw std::runtime_error("Unknown Layout"); + } +} + +data_type convertDataType(const ir::DataType type) +{ + switch (type) + { + case ir::DataType::QUANT_UINT8_ASYMM: + return DATA_TYPE_QASYMM8; + case ir::DataType::QUANT_INT16_SYMM: + return DATA_TYPE_QSYMM16; + default: + throw std::runtime_error("Unsupported data type"); + } +} + +} // namespace trix +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/trix/Convert.h b/runtime/onert/backend/trix/Convert.h new file mode 100644 index 000000000..662ed44b6 --- /dev/null +++ b/runtime/onert/backend/trix/Convert.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_TRIX_CONVERT_H__ +#define __ONERT_BACKEND_TRIX_CONVERT_H__ + +#include +#include +#include + +#include +#include + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +/** + * @brief Convert type of layout from onert type to npu type + * + * @param layout Layout type in onert + * @return data_layout Layout type in npu + */ +data_layout convertDataLayout(const ir::Layout layout); + +/** + * @brief Convert type of data from onert type to npu type + * + * @param type Data type in onert + * @return data_type Data type in npu + */ +data_type convertDataType(const ir::DataType type); + +/** + * @brief Set the tensors_data_info object + * + * @tparam T Type of tensor based of IPortableTensor + * @param tensors Tensors that have data information + * @param info tensors_data_info to be set + */ +template ::value, bool> = true> +void setDataInfo(const std::vector &tensors, tensors_data_info *info) +{ + info->num_info = static_cast(tensors.size()); + + for (uint32_t idx = 0; idx < info->num_info; ++idx) + { + info->info[idx].layout = convertDataLayout(tensors[idx]->layout()); + info->info[idx].type = convertDataType(tensors[idx]->data_type()); + } +} + +/** + * @brief Set the generic_buffers object + * + * @tparam T Type of tensor based of IPortableTensor + * @param tensors Tensors that have buffer information + * @param buf generic_buffers to be set + */ +template ::value, bool> = true> +void setBuffers(const std::vector &tensors, generic_buffers *buf) +{ + buf->num_buffers = static_cast(tensors.size()); + + for (uint32_t idx = 0; idx < buf->num_buffers; ++idx) + { + buf->bufs[idx].addr = tensors[idx]->buffer(); + buf->bufs[idx].size = static_cast(tensors[idx]->total_size()); + buf->bufs[idx].type = BUFFER_MAPPED; + } +} + +} // namespace trix +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_TRIX_CONVERT_H__ diff --git a/runtime/onert/backend/trix/DevContext.cc b/runtime/onert/backend/trix/DevContext.cc new file mode 100644 index 000000000..059514878 --- /dev/null +++ b/runtime/onert/backend/trix/DevContext.cc @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DevContext.h" + +#include "Convert.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +// All things related to npu device handle are gathered this Class, but when implementing npu +// deamon, others except the context roles should be seperated. 
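One contract from the BatchThreadPool introduced above is worth making explicit before the implementation that follows: every job bound through enqueueJob receives the worker's thread number as its first argument (via std::placeholders::_1), and DevContext reuses that number as the index of the device the job should talk to. A minimal, self-contained usage sketch, with an illustrative job and assuming the result_of-based signature shown in BatchThreadPool.h:

#include "BatchThreadPool.h"

#include <cstdint>
#include <iostream>

int main()
{
  onert::backend::trix::BatchThreadPool pool(2); // e.g. one worker per NPU device

  // The worker supplies its own thread number as the first parameter;
  // everything passed after the callable is forwarded as the remaining arguments.
  auto result = pool.enqueueJob(
    [](uint32_t thread_num, int batch_num) -> int32_t {
      std::cout << "batch " << batch_num << " runs on worker " << thread_num << '\n';
      return batch_num;
    },
    /*batch_num=*/0);

  return result.get(); // the returned std::future blocks here until the job has run
}

Because each worker is pinned to one device handle and the submit API called inside the jobs is synchronous, sizing the pool to the device count (as the constructor below does) keeps requests to the same device serialized.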
+DevContext::DevContext() : _dev_handles{}, _model_ids{}, _meta_map{} +{ + auto dev_count = getnumNPUdeviceByType(NPUCOND_TRIV2_CONN_SOCIP); + if (dev_count <= 0) + { + throw std::runtime_error("Unable to find TRIX NPU device"); + } + + // Get NPU device handles + for (int i = 0; i < dev_count; ++i) + { + npudev_h handle; + if (getNPUdeviceByType(&handle, NPUCOND_TRIV2_CONN_SOCIP, i) < 0) + { + throw std::runtime_error("Failed to get TRIX NPU device handle"); + } + _dev_handles.emplace_back(handle); + } + + // NOTE Do not change the number of threads as long as jobs in thread call + // the synchronous APIs such as submitNPU_request() + _batch_thread_pool = std::make_unique(_dev_handles.size()); + // We need to careful not to create multiple `BatchThreadPool`. In case of multiple models, there + // may be a problem having multiple `BatchThreadPool` in current implementation. But if this + // creating thread pool is moved to npu deamon, I think this problem will be solved smoothly. +} + +DevContext::~DevContext() +{ + // NOTE Must release _batch_thread_pool before releasing _dev_handles to wait for all threads to + // be terminated + _batch_thread_pool.reset(nullptr); + + for (const auto &dev_handle : _dev_handles) + { + unregisterNPUmodel_all(dev_handle); + putNPUdevice(dev_handle); + } +} + +ModelID DevContext::registerModel(const std::string &model_file_path) +{ + auto meta = getNPUmodel_metadata(model_file_path.c_str(), false); + + if (meta == nullptr) + { + throw std::runtime_error("Unable to extract the model metadata"); + } + + generic_buffer file_info; + file_info.type = BUFFER_FILE; + file_info.filepath = model_file_path.c_str(); + file_info.size = meta->size; + + ModelID model_id; + + for (uint32_t dev_num = 0; dev_num < _dev_handles.size(); ++dev_num) + { + // Register model for each device + uint32_t model_id_at_device; + if (registerNPUmodel(_dev_handles.at(dev_num), &file_info, &model_id_at_device) < 0) + { + throw std::runtime_error("Failed to register npu model"); + } + + if (dev_num == 0) + { + model_id = model_id_at_device; + _meta_map[model_id_at_device] = std::shared_ptr(meta); + } + else + { + _meta_map[model_id_at_device] = _meta_map[model_id]; + } + + _model_ids[model_id].resize(dev_num + 1); + _model_ids[model_id].at(dev_num) = model_id_at_device; + } + + // Return the model id for device 0 only + return model_id; +} + +void DevContext::unRegisterModel(ModelID model_id) +{ + for (uint32_t dev_num = 0; dev_num < _dev_handles.size(); ++dev_num) + { + const auto model_id_at_device = _model_ids.at(model_id).at(dev_num); + const auto &dev_handle = _dev_handles.at(dev_num); + + // Remove meta data + _meta_map.erase(model_id_at_device); + + // Unregister Model for each device + unregisterNPUmodel(dev_handle, model_id_at_device); + } + // Remove model IDs + _model_ids.erase(model_id); +} + +void DevContext::requestRun(ModelID model_id, input_buffers *input_bufs, tensors_data_info *in_info, + output_buffers *output_bufs, tensors_data_info *out_info, + size_t batch_size) +{ + if (batch_size > 1) + { + if (in_info->num_info != 1) + { + throw std::runtime_error("Supported only an input that has batch now"); + } + if (out_info->num_info != 1) + { + throw std::runtime_error("Supported only one output now"); + } + + if (input_bufs->bufs[0].size % batch_size != 0) + { + throw std::runtime_error("Invalid batch size. 
batch size :" + std::to_string(batch_size) + + ", input buffer size : " + std::to_string(input_bufs->bufs[0].size)); + } + + if (output_bufs->bufs[0].size % batch_size != 0) + { + throw std::runtime_error( + "Invalid batch size. batch size :" + std::to_string(batch_size) + + ", output tensor size : " + std::to_string(output_bufs->bufs[0].size)); + } + + // inputs/outputs for each batch + std::vector in_buffers_vec(batch_size); + std::vector out_buffers_vec(batch_size); + + // Run on thread pool + std::vector> batch_futures; + for (uint32_t batch_num = 0; batch_num < batch_size; ++batch_num) + { + // Enqueue jobs + // The in_info and out_info are always the same even if they are divided by batch, so they are + // used as they are. + auto future = _batch_thread_pool->enqueueJob( + [batch_size, in_info, out_info, + this](uint32_t dev_num, ModelID model_id, const input_buffers *input_bufs, + const output_buffers *output_bufs, uint32_t batch_num) -> int32_t { + // Set buffers of inputs/outputs for each batch + // TODO Support multiple inputs/outputs + input_buffers in_batch_buffers; + in_batch_buffers.num_buffers = input_bufs->num_buffers; + const uint64_t in_batch_offset = input_bufs->bufs[0].size / batch_size; + setBufferByBatch(input_bufs->bufs[0], batch_num, in_batch_offset, + &in_batch_buffers.bufs[0]); + + output_buffers out_batch_buffers; + out_batch_buffers.num_buffers = output_bufs->num_buffers; + const uint64_t out_batch_offset = output_bufs->bufs[0].size / batch_size; + setBufferByBatch(output_bufs->bufs[0], batch_num, out_batch_offset, + &out_batch_buffers.bufs[0]); + + try + { + // dev_num is the same as the thread number in _batch_thread_pool + this->runOneBatch(dev_num, model_id, &in_batch_buffers, in_info, &out_batch_buffers, + out_info); + } + catch (...) 
+ { + _eptr = std::current_exception(); + } + + return batch_num; + }, + model_id, input_bufs, output_bufs, batch_num); + batch_futures.emplace_back(std::move(future)); + } + + for (auto &&future : batch_futures) + { + future.get(); + } + + if (_eptr) + { + std::exception_ptr eptr(nullptr); + _eptr.swap(eptr); + std::rethrow_exception(eptr); + } + } + else + { + runOneBatch(0, model_id, input_bufs, in_info, output_bufs, out_info); + } +} + +void DevContext::runOneBatch(uint32_t dev_num, ModelID model_id, input_buffers *input_bufs, + tensors_data_info *in_info, output_buffers *output_bufs, + tensors_data_info *out_info) +{ + const auto &model_id_at_device = _model_ids.at(model_id).at(dev_num); + + const auto meta = _meta_map.at(model_id_at_device); + if (meta->input_seg_num != in_info->num_info) + { + throw std::runtime_error("The number of inputs does not match to model input seg num"); + } + + if (meta->output_seg_num != out_info->num_info) + { + throw std::runtime_error("The number of outputs does not match to model output seg num"); + } + + const auto &dev_handle = _dev_handles.at(dev_num); + int req_id; + + if (auto error_code = createNPU_request(dev_handle, model_id_at_device, &req_id)) + { + throw std::runtime_error("Unable to create NPU request with model id (" + + std::to_string(model_id_at_device) + ")" + + " error code : " + std::to_string(error_code)); + } + + if (auto error_code = + setNPU_requestData(dev_handle, req_id, input_bufs, in_info, output_bufs, out_info)) + { + removeNPU_request(dev_handle, req_id); + throw std::runtime_error("Unable to create NPU request for model id (" + + std::to_string(model_id_at_device) + ")" + + " error code : " + std::to_string(error_code)); + } + + // NOTE submitNPU_request is not thread-safe(?). It is rarely hanging(unresponsive). + // Ultimately, to solve this problem, we have to either use other thread-safe API or + // change submitNPU_request to be thread-safe, but both works take time. + // As a workaround, let's allow hanging thread. + // TODO Change submitNPU_request to be thread-safe or replaced with other thread-safe API + std::packaged_task task(submitNPU_request); + auto f = task.get_future(); + std::thread thread_submit_request(std::move(task), dev_handle, req_id); + auto status = f.wait_until(std::chrono::system_clock::now() + std::chrono::seconds(60)); + if (status == std::future_status::timeout) + { + // There is no way to terminate hanging submitNPU_request from the outside. + // If a hanging thread is detached, it will remain as a hanging thread. Even so, it's better + // than having the main thread hanging. + thread_submit_request.detach(); + + // TODO Enable removeNPU_request after resolving hanging. 
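The NOTE above is the crux of this function: because submitNPU_request can hang, it is wrapped in a std::packaged_task, its future is waited on with a deadline, and the wrapper thread is detached when the deadline passes. The same guard pattern reduced to a self-contained sketch, with a stand-in blocking function (blockingSubmit and its int parameters are illustrative, not the real NPU API):

#include <chrono>
#include <future>
#include <stdexcept>
#include <thread>

// Stand-in for a blocking C API; sleeps briefly instead of talking to hardware.
int blockingSubmit(int /*handle*/, int /*req_id*/)
{
  std::this_thread::sleep_for(std::chrono::milliseconds(10));
  return 0;
}

int submitWithTimeout(int handle, int req_id, std::chrono::seconds timeout)
{
  std::packaged_task<int(int, int)> task(blockingSubmit);
  auto result = task.get_future();
  std::thread submitter(std::move(task), handle, req_id);

  if (result.wait_for(timeout) == std::future_status::timeout)
  {
    // There is no portable way to cancel the blocked call, so the thread is
    // left detached; the caller at least stops waiting for it.
    submitter.detach();
    throw std::runtime_error("blocking submit timed out");
  }

  submitter.join();
  return result.get(); // propagate the call's return code
}

The detached thread is the accepted cost of the workaround: it lingers until the blocking call eventually returns, which the NOTE judges preferable to hanging the main thread.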
+ // removeNPU_request(dev_handle, req_id); + throw std::runtime_error("The npu API \"submitNPU_request\" timeout"); + } + + auto error_code = f.get(); + thread_submit_request.join(); + if (error_code != 0) + { + removeNPU_request(dev_handle, req_id); + throw std::runtime_error("Unable to submit NPU request with req id (" + std::to_string(req_id) + + ")" + " error code : " + std::to_string(error_code)); + } + + if (auto error_code = removeNPU_request(dev_handle, req_id)) + { + throw std::runtime_error("Unable to remove NPU request with req id (" + std::to_string(req_id) + + ")" + " error code : " + std::to_string(error_code)); + } +} + +void DevContext::setBufferByBatch(const generic_buffer &origin_buf, uint32_t batch_num, + uint64_t batch_offset, generic_buffer *batch_buf) +{ + batch_buf->addr = reinterpret_cast(origin_buf.addr) + batch_num * batch_offset; + batch_buf->size = batch_offset; + batch_buf->type = BUFFER_MAPPED; +} + +} // namespace trix +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/trix/DevContext.h b/runtime/onert/backend/trix/DevContext.h index a7dbd7a59..cd8de97e6 100644 --- a/runtime/onert/backend/trix/DevContext.h +++ b/runtime/onert/backend/trix/DevContext.h @@ -17,7 +17,12 @@ #ifndef __ONERT_BACKEND_TRIX_DEV_CONTEXT_H__ #define __ONERT_BACKEND_TRIX_DEV_CONTEXT_H__ +#include "BatchThreadPool.h" + #include +#include +#include +#include namespace onert { @@ -26,103 +31,117 @@ namespace backend namespace trix { +using ModelID = uint32_t; + +/** + * @brief NPU device context of trix backend + * + */ class DevContext { public: - DevContext() - { - auto device_count = getnumNPUdeviceByType(NPUCOND_TRIV2_CONN_SOCIP); - // TODO: x64 platform has 3 cores. We do not support more that 2 cores for now. - if (device_count > 2) - { - device_count = 2; - } - - if (device_count <= 0) - { - throw std::runtime_error("Unable to find TRIX NPU device"); - } - - for (int i = 0; i < device_count; i++) - { - npudev_h h; - if (getNPUdeviceByType(&h, NPUCOND_TRIV2_CONN_SOCIP, i) < 0) - { - throw std::runtime_error("Failed to get TRIX NPU device handle"); - } - _dev_handles.push_back(h); - } - } - - ~DevContext() - { - for (auto h : _dev_handles) - { - if (h != nullptr) - { - unregisterNPUmodel_all(h); - putNPUdevice(h); - } - } - } - - npudev_h getDev(int i) { return _dev_handles[i]; } - int getDevSize() { return _dev_handles.size(); } - - template void setDataInfo(tensors_data_info *info, std::vector &tensors) - { - info->num_info = static_cast(tensors.size()); - - for (uint32_t idx = 0; idx < info->num_info; ++idx) - { - info->info[idx].layout = convertDataLayout(tensors[idx]->layout()); - info->info[idx].type = convertDataType(tensors[idx]->data_type()); - } - } - - template - void setBuffer(generic_buffers *buf, std::vector &tensors, int batch_size, int batch_index) - { - buf->num_buffers = static_cast(tensors.size()); - - for (uint32_t idx = 0; idx < buf->num_buffers; ++idx) - { - buf->bufs[idx].size = static_cast(tensors[idx]->total_size() / batch_size); - buf->bufs[idx].addr = tensors[idx]->buffer() + (batch_index * buf->bufs[idx].size); - buf->bufs[idx].type = BUFFER_MAPPED; - } - } + /** + * @brief Construct a new device Context object + * + */ + DevContext(); + + /** + * @brief Destroy the device Context object + * + */ + ~DevContext(); + + DevContext(const DevContext &) = delete; + DevContext &operator=(const DevContext &) = delete; + + /** + * @brief Register a trix model for all NPU devices + * + * @param model_file_path File path of a trix model + 
* @return ModelID Internal ID of the trix model + */ + ModelID registerModel(const std::string &model_file_path); + + /** + * @brief Unregister a trix model + * + * @param model_id Internal ID of the trix model to be unregistered + */ + void unRegisterModel(ModelID model_id); + + /** + * @brief Request a trix model to be run on NPU + * + * @param model_id Internal ID of a trix model + * @param input_bufs Buffer data of inputs + * @param in_info Data info of inputs + * @param output_bufs Buffer data of outputs + * @param out_info data info of outputs + * @param batch_size Batch size + */ + void requestRun(ModelID model_id, input_buffers *input_bufs, tensors_data_info *in_info, + output_buffers *output_bufs, tensors_data_info *out_info, size_t batch_size); private: - data_layout convertDataLayout(const ir::Layout layout) - { - switch (layout) - { - case ir::Layout::NCHW: - return DATA_LAYOUT_NCHW; - case ir::Layout::NHWC: - return DATA_LAYOUT_NHWC; - default: - throw std::runtime_error("Unknown Layout"); - } - } - - data_type convertDataType(const ir::DataType type) - { - switch (type) - { - case ir::DataType::QUANT_UINT8_ASYMM: - return DATA_TYPE_QASYMM8; - case ir::DataType::QUANT_INT16_SYMM: - return DATA_TYPE_QSYMM16; - default: - throw std::runtime_error("Unsupported data type"); - } - } + /** + * @brief Rquest one batch of a trix model to be run on a device of NPU + * + * @param dev_num Device number + * @param model_id Internal ID of a trix model + * @param input_bufs Buffer data of inputs + * @param in_info Data info of inputs + * @param output_bufs Buffer data of outputs + * @param out_info data info of outputs + */ + void runOneBatch(uint32_t dev_num, ModelID model_id, input_buffers *input_bufs, + tensors_data_info *in_info, output_buffers *output_bufs, + tensors_data_info *out_info); + + /** + * @brief Set the buffer object by batch + * + * @param origin_buf Buffer object that has all batches + * @param batch_num Batch number + * @param batch_offset Size of a batch + * @param batch_buf One batch buffer object to be set + */ + void setBufferByBatch(const generic_buffer &origin_buf, uint32_t batch_num, uint64_t batch_offset, + generic_buffer *batch_buf); private: - // NPU device handles + /** + * @brief NPU device handles + * + */ std::vector _dev_handles; + + /** + * @brief Threadpool for batch-by-batch multi-threading + * + */ + std::unique_ptr _batch_thread_pool; + + // TODO Change key to internal trix model context(?) 
if it is needed + /** + * @brief Map for ID of models + * Internal Model ID : Model ID array for each device + * + */ + std::unordered_map> _model_ids; + + /** + * @brief Map for meta data + * Model ID at each device : meta data + * + */ + std::unordered_map> _meta_map; + + /** + * @brief Exception pointer captured whthin threads + * + */ + std::exception_ptr _eptr; }; } // namespace trix diff --git a/runtime/onert/backend/trix/KernelGenerator.cc b/runtime/onert/backend/trix/KernelGenerator.cc index 68e6840dd..2783bd75b 100644 --- a/runtime/onert/backend/trix/KernelGenerator.cc +++ b/runtime/onert/backend/trix/KernelGenerator.cc @@ -61,11 +61,11 @@ void KernelGenerator::visit(const ir::operation::Bulk &node) using ir::operation::Bulk; std::vector output_tensors; - for (auto &ofm_idx : node.getOutputs()) + for (const auto &ofm_idx : node.getOutputs()) output_tensors.emplace_back(_tensor_reg->getPortableTensor(ofm_idx)); std::vector input_tensors; - for (auto &ifm_idx : node.getInputs()) + for (const auto &ifm_idx : node.getInputs()) input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); // parameters diff --git a/runtime/onert/backend/trix/ops/BulkLayer.cc b/runtime/onert/backend/trix/ops/BulkLayer.cc index 3c49da9a3..db5c81ba7 100644 --- a/runtime/onert/backend/trix/ops/BulkLayer.cc +++ b/runtime/onert/backend/trix/ops/BulkLayer.cc @@ -15,10 +15,8 @@ */ #include "BulkLayer.h" -#include -#include -#include +#include "../Convert.h" namespace onert { @@ -29,12 +27,12 @@ namespace trix namespace ops { -BulkLayer::BulkLayer() : _inputs(), _outputs(), _model_id(0), _meta(nullptr), _dev_context(nullptr) +BulkLayer::BulkLayer() : _inputs(), _outputs(), _model_id(0), _dev_context(nullptr) { // DO NOTHING } -BulkLayer::~BulkLayer() { free(_meta); } +BulkLayer::~BulkLayer() { _dev_context->unRegisterModel(_model_id); } void BulkLayer::configure(const std::vector &inputs, std::vector &outputs, std::string binary_path, @@ -43,133 +41,28 @@ void BulkLayer::configure(const std::vector &inputs, _inputs = inputs; _outputs = outputs; _dev_context = dev_context; - - _meta = getNPUmodel_metadata(binary_path.c_str(), false); - if (_meta == nullptr) - { - throw std::runtime_error("Unable to extract the model metadata"); - } - - _model_id.resize(_dev_context->getDevSize()); - - generic_buffer model_file; - model_file.type = BUFFER_FILE; - model_file.filepath = binary_path.c_str(); - model_file.size = _meta->size; - - for (int i = 0; i < _dev_context->getDevSize(); i++) - { - if (registerNPUmodel(dev_context->getDev(i), &model_file, &_model_id[i]) < 0) - { - throw std::runtime_error("Failed to register npu model"); - } - } -} - -void single_job(npudev_h dev, int req_id, input_buffers *input_buf, tensors_data_info *in_info, - output_buffers *output_buf, tensors_data_info *out_info) -{ - if (setNPU_requestData(dev, req_id, input_buf, in_info, output_buf, out_info)) - { - throw std::runtime_error("Unable to create NPU request for red_id (" + std::to_string(req_id) + - ")"); - } - - if (submitNPU_request(dev, req_id)) - { - throw std::runtime_error("Unable to submit NPU request with req id (" + std::to_string(req_id) + - ")"); - } + _model_id = _dev_context->registerModel(binary_path); } void BulkLayer::run() { - // TODO: Remove too many assumption - // We assume user wants batch execution if user's input size is multiples of model's input size - int user_input_batch = (_inputs[0]->get_info().shape()).dim(0); - int model_input_batch = _meta->input_seg_dims[0][0]; - int batch_size = user_input_batch / 
model_input_batch; - bool is_batch_execution = (batch_size != 1 ? true : false); - - std::vector req_id(_dev_context->getDevSize()); - - for (int i = 0; i < _dev_context->getDevSize(); i++) - { - if (createNPU_request(_dev_context->getDev(i), _model_id[i], &req_id[i])) - { - throw std::runtime_error("Unable to create NPU request with model id (" + - std::to_string(_model_id[i]) + ")"); - } - } - - if (_meta->input_seg_num != _inputs.size()) - { - throw std::runtime_error("input size does not match to model input seg num"); - } - - if (_meta->output_seg_num != _outputs.size()) - { - throw std::runtime_error("output size does not match to model output seg num"); - } - tensors_data_info in_info; tensors_data_info out_info; - _dev_context->setDataInfo(&in_info, _inputs); - _dev_context->setDataInfo(&out_info, _outputs); + setDataInfo(_inputs, &in_info); + setDataInfo(_outputs, &out_info); - std::vector input_buf; - std::vector output_buf; - input_buf.resize(_dev_context->getDevSize()); - output_buf.resize(_dev_context->getDevSize()); - - std::vector> f(_dev_context->getDevSize()); - - const int num_cores = _dev_context->getDevSize(); - if (is_batch_execution) - { - // TODO: Support for general number of cores(>2) - // Here we assume that 2 trix cores - for (int i = 0; i < (batch_size); i = i + num_cores) - { - for (int core = 0; core < num_cores; core++) - { - _dev_context->setBuffer(&input_buf[core], _inputs, batch_size, - i + core); - _dev_context->setBuffer(&output_buf[core], _outputs, batch_size, i + core); - } - for (int core = 0; core < num_cores; core++) - { - - if (i + core < batch_size) - { - f[core] = - std::async(std::launch::async, &single_job, _dev_context->getDev(core), req_id[core], - &input_buf[core], &in_info, &output_buf[core], &out_info); - } - } - for (int core = 0; core < num_cores; core++) - { - f[core].wait(); - } - } - } - else - { - _dev_context->setBuffer(&input_buf[0], _inputs, batch_size, 0); - _dev_context->setBuffer(&output_buf[0], _outputs, batch_size, 0); - - single_job(_dev_context->getDev(0), req_id[0], &input_buf[0], &in_info, &output_buf[0], - &out_info); - } + input_buffers input_bufs; + output_buffers output_bufs; + setBuffers(_inputs, &input_bufs); + setBuffers(_outputs, &output_bufs); - for (int i = 0; i < _dev_context->getDevSize(); i++) + size_t batch_size = 1; + // TODO Remove this assumption + if (_inputs.size() == 1 && _outputs.size() == 1 && _inputs.at(0)->getShape().dim(0) > 1) { - if (removeNPU_request(_dev_context->getDev(i), req_id[i])) - { - throw std::runtime_error("Unable to remove NPU request with req id (" + - std::to_string(req_id[i]) + ")"); - } + batch_size = _inputs.at(0)->getShape().dim(0); } + _dev_context->requestRun(_model_id, &input_bufs, &in_info, &output_bufs, &out_info, batch_size); } void BulkLayer::prepare() diff --git a/runtime/onert/backend/trix/ops/BulkLayer.h b/runtime/onert/backend/trix/ops/BulkLayer.h index 614c0f728..6590b6989 100644 --- a/runtime/onert/backend/trix/ops/BulkLayer.h +++ b/runtime/onert/backend/trix/ops/BulkLayer.h @@ -50,8 +50,7 @@ private: std::vector _inputs; std::vector _outputs; - std::vector _model_id; - npubin_meta *_meta; + ModelID _model_id; std::shared_ptr _dev_context; }; diff --git a/runtime/onert/backend/xnnpack/BackendContext.cc b/runtime/onert/backend/xnnpack/BackendContext.cc index 42fffb608..c52e275aa 100644 --- a/runtime/onert/backend/xnnpack/BackendContext.cc +++ b/runtime/onert/backend/xnnpack/BackendContext.cc @@ -50,7 +50,7 @@ FunctionMap BackendContext::genKernels() .operands() 
.iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - for (auto &it : ret) + for (auto &&it : ret) { auto &fn_seq = it.second; fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); diff --git a/runtime/onert/core/CMakeLists.txt b/runtime/onert/core/CMakeLists.txt index 87c7a13e4..8041ab5bc 100644 --- a/runtime/onert/core/CMakeLists.txt +++ b/runtime/onert/core/CMakeLists.txt @@ -57,4 +57,4 @@ target_link_libraries(${TEST_ONERT_CORE} nnfw_coverage) target_link_libraries(${TEST_ONERT_CORE} gtest gtest_main dl ${LIB_PTHREAD}) add_test(${TEST_ONERT_CORE} ${TEST_ONERT_CORE}) -install(TARGETS ${TEST_ONERT_CORE} DESTINATION unittest_standalone) +install(TARGETS ${TEST_ONERT_CORE} DESTINATION unittest) diff --git a/runtime/onert/core/include/backend/basic/BackendContextHelpers.h b/runtime/onert/core/include/backend/basic/BackendContextHelpers.h index cf2da4c34..970a9f71c 100644 --- a/runtime/onert/core/include/backend/basic/BackendContextHelpers.h +++ b/runtime/onert/core/include/backend/basic/BackendContextHelpers.h @@ -84,19 +84,23 @@ template void planTensors(const T_BackendContext &ct tensor_builder->notifyFirstUse(ind); } - for (auto &pair : def_map) + for (const auto &pair : def_map) { - if (pair.second == 0) - tensor_builder->notifyFirstUse(pair.first); + const auto &ind = pair.first; + const auto def_count = pair.second; + if (def_count == 0) + tensor_builder->notifyFirstUse(ind); } // This is a workaround to keep the operands over the execution // (the operands look like they are unused) std::vector operands_last_until_end; - for (auto &pair : uses_map) + for (const auto &pair : uses_map) { - if (pair.second == 0) - operands_last_until_end.push_back(pair.first); + const auto &ind = pair.first; + const auto use_count = pair.second; + if (use_count == 0) + operands_last_until_end.push_back(ind); } // At each operation, @@ -161,7 +165,7 @@ template void planTensors(const T_BackendContext &ct } } - for (auto &ind : operands_last_until_end) + for (const auto &ind : operands_last_until_end) { tensor_builder->notifyLastUse(ind); } diff --git a/runtime/onert/core/include/compiler/Compiler.h b/runtime/onert/core/include/compiler/Compiler.h index f05d63c66..9a86f407e 100644 --- a/runtime/onert/core/include/compiler/Compiler.h +++ b/runtime/onert/core/include/compiler/Compiler.h @@ -22,76 +22,19 @@ #ifndef __ONERT_COMPILER_COMPILE_H_ #define __ONERT_COMPILER_COMPILE_H_ +#include "CompilerOptions.h" +#include "ICompiler.h" #include "ir/NNPkg.h" -#include "exec/Executors.h" -#include "util/TracingCtx.h" namespace onert { - namespace compiler { -enum class State -{ - CREATED, // Before compilation - COMPILED // Success compilation -}; - -struct ManualSchedulerOptions -{ -public: - void setBackendMap(const std::string &str); - -public: - std::string backend_for_all; - std::unordered_map opcode_to_backend; - std::unordered_map index_to_backend; -}; - -struct PartialGraphOptions -{ - std::unordered_map index_to_graph; -}; - -class CompilerOptions -{ -public: - // Set default values for CompilerOptions - // All these default values should not be fetched from Env, when we stop supporting Android NNAPI. 
- static std::unique_ptr fromGlobalConfig(); - -public: - // GENERAL OPTIONS - std::vector backend_list; - - // OPTIONS ONLY FOR DEBUGGING/PROFILING - std::string trace_filepath; //< File path to save trace records - int graph_dump_level; //< Graph dump level, values between 0 and 2 are valid - std::string executor; //< Executor name to use - ManualSchedulerOptions manual_scheduler_options; //< Options for ManualScheduler - bool he_scheduler; //< HEScheduler if true, ManualScheduler otherwise - bool he_profiling_mode; //< Whether HEScheduler profiling mode ON/OFF - bool disable_compile; //< Run with Interpreter if true, try compilation otherwise - bool fp16_enable; //< Whether fp16 mode ON/OFF - PartialGraphOptions partial_graph_options; -}; - -struct CompilerArtifact -{ - CompilerArtifact(void) = delete; - CompilerArtifact(std::shared_ptr executors, - std::unique_ptr tracing_ctx) - : _executors{executors}, _tracing_ctx{std::move(tracing_ctx)} {}; - - std::shared_ptr _executors; - std::unique_ptr _tracing_ctx; -}; - /** * @brief Class to compile NN package */ -class Compiler +class Compiler : public ICompiler { public: /** @@ -109,55 +52,25 @@ public: Compiler(const std::shared_ptr &nnpkg, std::vector> &copts); -public: /** - * @brief Do compilation with the options - * - * @return std::shared_ptr Executors as a result of compilation + * @brief Destroy the Compiler object */ - std::shared_ptr compile(void); + ~Compiler() = default; +public: /** * @brief Do compilation with the options * - * @return std::vector> Executors as a result of compilation - * for pipeline - */ - std::vector> compile(const char *package_file_path, - const char *map_file_path); - - State state(void) const { return _state; } - - /** - * @brief Allow to compute float32 using float16 data type - */ - void enableToFp16(); - - /** - * @brief Build the partial graphs to compile with original graph + * @return std::shared_ptr Executors as a result of compilation */ - bool buildPartialGraph(uint32_t num_graphs); - -private: - void checkProfilerConditions(); - std::shared_ptr &primary_subgraph() - { - return _nnpkg->primary_model()->at(ir::SubgraphIndex{0}); - } + std::shared_ptr compile(void); private: - std::shared_ptr _nnpkg; - // NOTE These executors does not have duplicated subgraph. This mean they do not allow support - // subgraphs being called recursively because data of non-constant tensor of parent executor will - // be updated by child executor. If you want to support subgraphs being called recursively, you - // have to add allocate non-constant tensor memory of executors in execution time when each - // subgraph is called. - State _state; - std::vector _voptions; + std::shared_ptr _model; + CompilerOptions *_options; }; } // namespace compiler - } // namespace onert #endif // __ONERT_COMPILER_COMPILE_H_ diff --git a/runtime/onert/core/include/compiler/CompilerFactory.h b/runtime/onert/core/include/compiler/CompilerFactory.h new file mode 100644 index 000000000..4894366a2 --- /dev/null +++ b/runtime/onert/core/include/compiler/CompilerFactory.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_COMPILER_COMPILER_FACTORY_H__ +#define __ONERT_COMPILER_COMPILER_FACTORY_H__ + +#include "ICompiler.h" +#include "CompilerOptions.h" +#include "ir/NNPkg.h" + +namespace onert +{ +namespace compiler +{ + +// TODO Support register and use compiler plugin +class CompilerFactory +{ +public: + static CompilerFactory &get(); + +public: + std::unique_ptr create(const std::shared_ptr &nnpkg, + std::vector> &copts); + +private: + // It is not allowed to use CompilerFactory without get() + CompilerFactory() = default; +}; + +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_COMPILER_FACTORY_H__ diff --git a/runtime/onert/core/include/compiler/CompilerOptions.h b/runtime/onert/core/include/compiler/CompilerOptions.h new file mode 100644 index 000000000..bbe15fc06 --- /dev/null +++ b/runtime/onert/core/include/compiler/CompilerOptions.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_COMPILER_COMPILER_OPTIONS_H_ +#define __ONERT_COMPILER_COMPILER_OPTIONS_H_ + +#include "ir/OpCode.h" +#include "ir/Index.h" + +#include +#include +#include +#include + +namespace onert +{ +namespace compiler +{ + +struct ManualSchedulerOptions +{ +public: + void setBackendMap(const std::string &str); + +public: + std::string backend_for_all; + std::unordered_map opcode_to_backend; + std::unordered_map index_to_backend; +}; + +class CompilerOptions +{ +public: + /** + * @brief Set default values for CompilerOptions + * @return Generated CompileOption + * + * @note All these default values should not be fetched from Env + * when we stop supporting Android NNAPI. 
+ */ + static std::unique_ptr fromGlobalConfig(); + + /** + * @brief Allow to compute float32 using float16 data type + */ + void enableToFp16() { fp16_enable = true; } + + /** + * @brief Force default values of CompilerOptions for correct compilations + * + * @note This should be called after CompilerOptions setting is finished + * to prevent value overwriting + */ + void forceInternalOptions(); + + /** + * @brief Print option value + */ + void verboseOptions(); + +public: + // GENERAL OPTIONS + std::vector backend_list; + + // OPTIONS ONLY FOR DEBUGGING/PROFILING + std::string trace_filepath; //< File path to save trace records + int graph_dump_level; //< Graph dump level, values between 0 and 2 are valid + std::string executor; //< Executor name to use + ManualSchedulerOptions manual_scheduler_options; //< Options for ManualScheduler + bool he_scheduler; //< HEScheduler if true, ManualScheduler otherwise + bool he_profiling_mode; //< Whether HEScheduler profiling mode ON/OFF + bool fp16_enable; //< Whether fp16 mode ON/OFF +}; + +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_COMPILER_OPTIONS_H_ diff --git a/runtime/onert/core/include/compiler/ICompiler.h b/runtime/onert/core/include/compiler/ICompiler.h new file mode 100644 index 000000000..255e0509d --- /dev/null +++ b/runtime/onert/core/include/compiler/ICompiler.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file ICompiler.h + * @brief This file contains ICompiler class to define and run compilation phase + */ + +#ifndef __ONERT_COMPILER_I_COMPILER_H_ +#define __ONERT_COMPILER_I_COMPILER_H_ + +#include "exec/IExecutors.h" +#include "util/TracingCtx.h" + +namespace onert +{ +namespace compiler +{ + +struct CompilerArtifact +{ + CompilerArtifact(void) = delete; + CompilerArtifact(std::shared_ptr executors, + std::unique_ptr tracing_ctx) + : _executors{executors}, _tracing_ctx{std::move(tracing_ctx)} {}; + + std::shared_ptr _executors; + std::unique_ptr _tracing_ctx; +}; + +class ICompiler +{ +public: + /** + * @brief Virtual ICompiler destructor + * @note Require derived class destructor + */ + virtual ~ICompiler() = default; + + /** + * @brief Do compilation + * @return std::shared_ptr Executors as a result of compilation + */ + virtual std::shared_ptr compile(void) = 0; +}; + +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_I_COMPILER_H_ diff --git a/runtime/onert/core/include/compiler/LoweredGraph.h b/runtime/onert/core/include/compiler/LoweredGraph.h index 7264f2a10..e9f0ae0de 100644 --- a/runtime/onert/core/include/compiler/LoweredGraph.h +++ b/runtime/onert/core/include/compiler/LoweredGraph.h @@ -36,13 +36,9 @@ class LoweredGraph { public: LoweredGraph(const ir::Graph &graph, const compiler::CompilerOptions &options); - LoweredGraph(const ir::Graph &parent_graph, const ir::Graph &graph, - const compiler::CompilerOptions &options); ir::Graph &graph() { return _graph; } const ir::Graph &graph() const { return _graph; } - ir::Graph &parent_graph() { return _parent_graph; } - const ir::Graph &parent_graph() const { return _parent_graph; } const compiler::GraphLowerInfo &lower_info() const { return _lower_info_map; } compiler::GraphLowerInfo &lower_info() { return _lower_info_map; } std::shared_ptr> indexed_ranks() { return _indexed_ranks; } @@ -69,7 +65,6 @@ private: * It allows the original graph can be compiled multiple times. 
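Taken together, ICompiler, CompilerFactory and CompilerOptions replace the old monolithic Compiler entry point. A hedged sketch of the intended call sequence for an already-loaded package, assuming create() takes the package plus a vector of per-model CompilerOptions and that compile() returns the CompilerArtifact declared above (the factory presumably picks a concrete compiler such as Compiler or MultiModelCompiler):

#include <memory>
#include <vector>

#include "compiler/CompilerFactory.h"
#include "compiler/CompilerOptions.h"
#include "compiler/ICompiler.h"
#include "ir/NNPkg.h"

// Hedged sketch: drive the refactored compile path for a loaded NN package.
std::shared_ptr<onert::compiler::CompilerArtifact>
compilePackage(const std::shared_ptr<onert::ir::NNPkg> &nnpkg)
{
  using namespace onert::compiler;

  // One option set; fromGlobalConfig() fills in the defaults documented above.
  std::vector<std::unique_ptr<CompilerOptions>> copts;
  copts.emplace_back(CompilerOptions::fromGlobalConfig());

  // The factory returns some concrete ICompiler behind the interface (signature assumed).
  auto compiler = CompilerFactory::get().create(nnpkg, copts);

  // compile() hands back the executor set plus tracing context.
  return compiler->compile();
}

Per the note in CompilerOptions.h, forceInternalOptions() should be called only after all user-visible settings are in place, so a caller doing more than the defaults would invoke it last.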
*/ ir::Graph _graph; - ir::Graph _parent_graph; std::shared_ptr> _indexed_ranks; compiler::GraphLowerInfo _lower_info_map; ir::OperationIndexMap _has_dynamic_tensor_map; diff --git a/runtime/onert/core/include/compiler/StaticShapeInferer.h b/runtime/onert/core/include/compiler/StaticShapeInferer.h index f701dc207..94d6ba1a7 100644 --- a/runtime/onert/core/include/compiler/StaticShapeInferer.h +++ b/runtime/onert/core/include/compiler/StaticShapeInferer.h @@ -101,6 +101,15 @@ public: void dump(); + /** + * @brief Create a lowered model shape inferer map + * @param[in] lowered_subgs lowered model subgraph map + * @return Shape inferer map + */ + static std::unordered_map> + createStaticShapeInferers( + const std::unordered_map> &lowered_subgs); + private: bool checkDynamicInput(const ir::Operation &op); bool checkDynamicOutput(const ir::Operation &op); diff --git a/runtime/onert/core/include/exec/Execution.h b/runtime/onert/core/include/exec/Execution.h index 1e8083c4c..ba3edcdd6 100644 --- a/runtime/onert/core/include/exec/Execution.h +++ b/runtime/onert/core/include/exec/Execution.h @@ -22,7 +22,7 @@ #define __ONERT_EXEC_EXECUTION_H__ #include "ir/Layout.h" -#include "exec/Executors.h" +#include "exec/IExecutors.h" #include "IODescription.h" #include @@ -46,16 +46,15 @@ public: * @brief Construct a new Execution object * @param[in] executor Model executor */ - Execution(const std::shared_ptr &executors); + Execution(const std::shared_ptr &executors); public: /** * @brief Returns primary graph object * @return Graph object */ - const ir::Graph &primary_subgraph() const { return primary_executor()->graph(); } + const ir::Graph &primary_subgraph() const { return entryExecutor()->graph(); } - const ir::Graph &primary_parentgraph() const { return primary_executor()->parent_graph(); } /** * @brief Change input shape * @param[in] index Input index @@ -146,121 +145,15 @@ public: ir::Shape getInputShape(ir::IOIndex ind) const; ir::Shape getOutputShape(ir::IOIndex ind) const; - // - // Experimental API - // - - // accessor - std::vector< - std::tuple, onert::ir::IOIndex, onert::ir::IOIndex>> - getNextExes() - { - return next_exes; - } - std::deque> *getAsyncIoDescs() { return &_async_io_descs; } - std::deque> *getAsyncResults() { return &_async_results; } - - /** - * @brief Push IO information between related executions into next_exes - * @param[in] next address of next execution - * @param[in] o_index Output index of current execution (it will be the input of next execution) - * @param[in] i_index Input index of next execution - */ - void pushNextExe(std::shared_ptr next, onert::ir::IOIndex o_index, - onert::ir::IOIndex i_index) - { - next_exes.push_back({next, o_index, i_index}); - } - - /** - * @brief Create New IODescription instance for new inputs outputs - * @param[in] index instance count number - */ - void createNewAsyncDesc(uint32_t count = 0); - - /** - * @brief Set async input data's information - * @param[in] index Input index - * @param[in] buffer Input data's buffer pointer - * @param[in] length Input data's length - * @param[in] layout Input data's data format - */ - void executeAsyncInput(const ir::IOIndex &index, const void *buffer, size_t length, - ir::Layout layout = ir::Layout::NHWC); - - /** - * @brief Set async output data's information - * @param[in] index Output index - * @param[in] buffer Output data's buffer pointer - * @param[in] length Output data's length - * @param[in] layout Output data's data format - */ - void executeAsyncOutput(const ir::IOIndex &index, void 
*buffer, size_t length, - ir::Layout layout = ir::Layout::NHWC); - - /** - * @brief Async execution - * @note It should be called after setting input and output buffer - */ - void AsyncExecute(); - - /** - * @brief Set finish - */ - void setFinish(); - - /** - * @brief Check if input queue is empty - * @return @c true if queue is empty, otherwise @c false - */ - bool isEmptyQueue(); - - /** - * @brief Wait semaphore to prevent race condition - */ - void asyncIoDescSemWait(); - - /** - * @brief Post semaphore to prevent race condition - */ - void asyncIoDescSemPost(); - - /** - * @brief Inference - * @note this function provided to the thread for pipelining - */ - void runInference(); - - /** - * @brief Check if stop_wait is true - * @return @c true if stop_wait is true, otherwise @c false - */ - bool stopWait(void) const; - - /** - * @brief Set stop_wait to terminate consumer thread - */ - void sholudStop(); - private: - const std::unique_ptr &primary_executor() const - { - return _executors->at(ir::SubgraphIndex{0}); - }; - std::unique_ptr &primary_executor() { return _executors->at(ir::SubgraphIndex{0}); }; + const IExecutor *entryExecutor() const { return _executors->entryExecutor(); }; + IExecutor *entryExecutor() { return _executors->entryExecutor(); }; private: - const std::shared_ptr _executors; + const std::shared_ptr _executors; IODescription _io_desc; - std::deque> _async_io_descs; - sem_t _async_io_descs_sem; - std::deque> _async_results; - std::vector< - std::tuple, onert::ir::IOIndex, onert::ir::IOIndex>> - next_exes; std::unique_ptr _exec_thread; bool finished{false}; - bool stop_wait{false}; }; } // namespace exec diff --git a/runtime/onert/core/include/exec/Executors.h b/runtime/onert/core/include/exec/Executors.h deleted file mode 100644 index 5adb0eda4..000000000 --- a/runtime/onert/core/include/exec/Executors.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_EXEC_EXECUTORS_H__ -#define __ONERT_EXEC_EXECUTORS_H__ - -#include "IExecutor.h" -#include "ir/NNPkg.h" - -namespace onert -{ -namespace exec -{ - -/** - * @brief Class to gather executors - */ -class Executors -{ -public: - Executors(void) = default; - Executors(std::unique_ptr model_edges) { _model_edges = std::move(model_edges); } - Executors(const Executors &) = delete; - Executors(Executors &&) = default; - - // TODO Use Executor index - void emplace(ir::SubgraphIndex idx, std::unique_ptr exec) - { - _executors.emplace(idx, std::move(exec)); - } - - std::unique_ptr &at(ir::SubgraphIndex idx) { return _executors.at(idx); } - - uint32_t inputSize() const; - - uint32_t outputSize() const; - - const ir::OperandInfo inputInfo(const ir::IOIndex &index); - - const ir::OperandInfo outputInfo(const ir::IOIndex &index); - - void execute(const IODescription &desc); - -private: - void executeEntries(const IODescription &desc); - -private: - // TODO Use Executor index - // Changing index will effect if/while compile and kernel implementation - std::unordered_map> _executors; - // NOTE _model_edges may use different struct type for executor implementation - std::unique_ptr _model_edges; -}; - -} // namespace exec -} // namespace onert - -#endif // __ONERT_EXEC_EXECUTORS_H__ diff --git a/runtime/onert/core/include/exec/FunctionSequence.h b/runtime/onert/core/include/exec/FunctionSequence.h index 7ff6d8b8c..a7020d425 100644 --- a/runtime/onert/core/include/exec/FunctionSequence.h +++ b/runtime/onert/core/include/exec/FunctionSequence.h @@ -66,7 +66,7 @@ public: template void wrap(Args &&... args) { - for (auto &function : _functions) + for (auto &&function : _functions) { function = std::make_unique(std::move(function), args...); } diff --git a/runtime/onert/core/include/exec/IExecutor.h b/runtime/onert/core/include/exec/IExecutor.h index bb5b5af98..46dbcd033 100644 --- a/runtime/onert/core/include/exec/IExecutor.h +++ b/runtime/onert/core/include/exec/IExecutor.h @@ -46,7 +46,6 @@ namespace onert { namespace exec { -class IExecutionObserver; /** * @brief Struct to define interface of Executor */ @@ -66,14 +65,7 @@ struct IExecutor * * @return Graph object */ - virtual const ir::Graph &graph() = 0; - - /** - * @brief Returns parent graph object - * - * @return Graph object - */ - virtual const ir::Graph &parent_graph() = 0; + virtual const ir::Graph &graph() const = 0; /** * @brief Set an ordering on operations @@ -99,6 +91,13 @@ struct IExecutor virtual void execute(const std::vector &inputs, const std::vector &outputs) = 0; + /** + * @brief Get input tensor objects + * + * @return Vector of @c IOTensor + */ + virtual const std::vector &getInputTensors() const = 0; + /** * @brief Get output tensor objects * diff --git a/runtime/onert/core/include/exec/IExecutors.h b/runtime/onert/core/include/exec/IExecutors.h new file mode 100644 index 000000000..013da716b --- /dev/null +++ b/runtime/onert/core/include/exec/IExecutors.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_EXEC_I_EXECUTORS_H__ +#define __ONERT_EXEC_I_EXECUTORS_H__ + +#include "IExecutor.h" + +namespace onert +{ +namespace exec +{ + +/** + * @brief Class to gather NN package's executor set + */ +class IExecutors +{ +public: + /** + * @brief Virtual IExecutors destructor + * @note Require derived class destructor + */ + virtual ~IExecutors() = default; + +public: + /** + * @brief Insert executor in executor set + * @param[in] model_index Model index + * @param[in] subg_index Subgraph index + * @param[in] exec Executor to insert + * + * @todo Use Executor index + */ + virtual void emplace(const ir::ModelIndex &model_index, const ir::SubgraphIndex &subg_index, + std::unique_ptr exec) = 0; + + /** + * @brief Return executor of index + * @param[in] model_index Model index + * @param[in] subg_index Subgraph index + * @return Executor + */ + virtual IExecutor *at(const ir::ModelIndex &model_index, + const ir::SubgraphIndex &subg_index) const = 0; + + IExecutor *entryExecutor() const { return at(ir::ModelIndex{0}, ir::SubgraphIndex{0}); } + + /** + * @brief Return executor set's number of input + * @return Number of input + */ + virtual uint32_t inputSize() const = 0; + + /** + * @brief Return executor set's number of output + * @return Number of output + */ + virtual uint32_t outputSize() const = 0; + + /** + * @brief Return NN package input tensor info + * @param[in] index Input index + * @return Tensor info + */ + virtual const ir::OperandInfo &inputInfo(const ir::IOIndex &index) const = 0; + + /** + * @brief Return NN package output tensor info + * @param[in] index Output index + * @return Tensor info + */ + virtual const ir::OperandInfo &outputInfo(const ir::IOIndex &index) const = 0; + + /** + * @brief Execute NN package executor set + * @param[in] desc Input and output buffer description + */ + virtual void execute(const IODescription &desc) = 0; +}; + +} // namespace exec +} // namespace onert + +#endif // __ONERT_EXEC_I_EXECUTORS_H__ diff --git a/runtime/onert/core/include/ir/Graph.h b/runtime/onert/core/include/ir/Graph.h index 286caf72f..1783cdca0 100644 --- a/runtime/onert/core/include/ir/Graph.h +++ b/runtime/onert/core/include/ir/Graph.h @@ -89,15 +89,6 @@ public: void verify(void); void removeOperand(const OperandIndex &ind) { _operands.remove(ind); } void setLayout(Layout layout) { _layout = layout; } - void setPartialModel(const std::shared_ptr &partial_model) - { - _partialgraphs = partial_model; - } - void - setTensorName(std::shared_ptr> &tensor_names) - { - _tensor_names = tensor_names; - } private: bool checkOperandsForOperation(const Operation &operation); @@ -136,29 +127,6 @@ public: const Operations &operations() const { return _operations; } Operations &operations() { return _operations; } Layout layout() const { return _layout; } - std::shared_ptr &partialgraphs() { return _partialgraphs; } - std::shared_ptr> &tensor_names() - { - return _tensor_names; - } - std::unordered_map::const_iterator _name_to_input_begin() const - { - return _name_to_input.begin(); - } - std::unordered_map::const_iterator _name_to_input_end() const - { - return _name_to_input.end(); - } - std::unordered_map::const_iterator _name_to_output_begin() const - { - return _name_to_output.begin(); - } - std::unordered_map::const_iterator _name_to_output_end() const - { - return _name_to_output.end(); - } - void input_sort() { _inputs.sort(); } - void output_sort() { 
_outputs.sort(); } // Topological sort public: @@ -173,10 +141,6 @@ private: std::unordered_map _name_to_output; // TFLite and circle's default layout is NHWC; Layout _layout{Layout::NHWC}; - - // model for partial graphs - std::shared_ptr _partialgraphs; - std::shared_ptr> _tensor_names; }; } // namespace ir diff --git a/runtime/onert/core/include/ir/Index.h b/runtime/onert/core/include/ir/Index.h index f01a4c84d..1864c3bdb 100644 --- a/runtime/onert/core/include/ir/Index.h +++ b/runtime/onert/core/include/ir/Index.h @@ -36,10 +36,10 @@ struct IOIndexTag; using IOIndex = ::onert::util::Index; struct SubgraphIndexTag; -using SubgraphIndex = ::onert::util::Index; +using SubgraphIndex = ::onert::util::Index; struct ModelIndexTag; -using ModelIndex = ::onert::util::Index; +using ModelIndex = ::onert::util::Index; template std::ostream &_index_print_impl(std::ostream &o, const std::string &prefix, IndexType index) diff --git a/runtime/onert/core/include/ir/NNPkg.h b/runtime/onert/core/include/ir/NNPkg.h index d9f825e85..b23745d55 100644 --- a/runtime/onert/core/include/ir/NNPkg.h +++ b/runtime/onert/core/include/ir/NNPkg.h @@ -21,6 +21,7 @@ #include #include +#include "ir/Graph.h" #include "ir/Index.h" #include "ir/Model.h" @@ -89,7 +90,7 @@ public: ~NNPkg() = default; NNPkg(std::shared_ptr model) { _models[ModelIndex{0}] = model; } - std::shared_ptr primary_model() { return _models.at(onert::ir::ModelIndex{0}); } + std::shared_ptr primary_model() const { return _models.at(onert::ir::ModelIndex{0}); } /** * @brief Put model at index @@ -180,6 +181,91 @@ public: */ const ModelEdges &model_edges() { return _edges; } + /** + * @brief Verify NNPkg + * + */ + void verify(void) + { + // Verify edges information + // + // Only duplicates of nnpkg output and Edge `from` are possible. + // | Whether duplicates are possible | Edge `to` | Edge `from` | + // | nnpkg input (input of subgraph) | X (*1) | X (*2) | + // | nnpkg output (output of subgraph) | X (*2) | O | + // *1. The subjects who determine values of each buffer are different. + // - nnpkg input : user input + // - Edge `to` : output of another subgraph + // *2. `IOIndex` of inputs and outputs of subgraph is distinct. + // + for (const auto &edge : _edges.edges) + { + if (std::find(_edges.pkg_inputs.begin(), _edges.pkg_inputs.end(), edge.to) != + _edges.pkg_inputs.end()) + { + throw std::runtime_error{ + "Invalid edge information. NNPkg inputs and Edge `to` cannot be duplicated"}; + } + } + } + + // TODO Find better way to handle single model NNPackage and multi model NNPackage on inputSize(), + // outputSize(), inputInfo(), outputInfo() + + /** + * @brief Get model input size + */ + uint32_t inputSize() const + { + return _models.size() == 1 ? primary_model()->primary_subgraph()->getInputs().size() + : _edges.pkg_inputs.size(); + } + + /** + * @brief Get model output size + */ + uint32_t outputSize() const + { + return _models.size() == 1 ? 
primary_model()->primary_subgraph()->getOutputs().size() + : _edges.pkg_outputs.size(); + } + + /** + * @brief Get model input info + */ + OperandInfo &inputInfo(uint32_t index) const + { + if (_models.size() == 1) + { + auto const graph = primary_model()->primary_subgraph(); + auto const operand_index = graph->getInputs().at(index); + return graph->operands().at(operand_index).info(); + } + + auto const &desc = input(index); + auto const graph = model(std::get(desc))->primary_subgraph(); + auto const operand_index = graph->getInputs().at(std::get(desc).value()); + return graph->operands().at(operand_index).info(); + } + + /** + * @brief Get model output info + */ + OperandInfo &outputInfo(uint32_t index) const + { + if (_models.size() == 1) + { + auto const graph = primary_model()->primary_subgraph(); + auto const operand_index = graph->getOutputs().at(index); + return graph->operands().at(operand_index).info(); + } + + auto const &desc = output(index); + auto const graph = model(std::get(desc))->primary_subgraph(); + auto const operand_index = graph->getOutputs().at(std::get(desc).value()); + return graph->operands().at(operand_index).info(); + } + // TODO: Add iterate() or getter for edges private: @@ -190,4 +276,18 @@ private: } // namespace ir } // namespace onert +namespace std +{ + +template <> struct hash +{ + size_t operator()(const ::onert::ir::IODesc &iodesc) const noexcept + { + return (std::get<0>(iodesc).value() << 24) | (std::get<1>(iodesc).value() << 16) | + std::get<2>(iodesc).value(); + } +}; + +} // namespace std + #endif // __ONERT_IR_NNPKG_H__ diff --git a/runtime/onert/core/include/ir/OperandIndexSequence.h b/runtime/onert/core/include/ir/OperandIndexSequence.h index dd390748b..846c3f950 100644 --- a/runtime/onert/core/include/ir/OperandIndexSequence.h +++ b/runtime/onert/core/include/ir/OperandIndexSequence.h @@ -19,7 +19,6 @@ #include #include -#include #include "ir/Index.h" @@ -46,12 +45,6 @@ public: void append(const OperandIndex &index) { _vec.emplace_back(index); } void append(const OperandIndexSequence &l) { _vec.insert(_vec.end(), l.begin(), l.end()); } - void sort() - { - std::sort(_vec.begin(), _vec.end(), - [](const auto &lhs, const auto &rhs) { return lhs.value() < rhs.value(); }); - } - public: uint32_t size() const { return static_cast(_vec.size()); } const OperandIndex &at(IOIndex set_index) const { return _vec.at(set_index.value()); } diff --git a/runtime/onert/core/include/ir/Shape.h b/runtime/onert/core/include/ir/Shape.h index ec6dd07af..cf84e2626 100644 --- a/runtime/onert/core/include/ir/Shape.h +++ b/runtime/onert/core/include/ir/Shape.h @@ -70,8 +70,8 @@ struct FeatureShape struct Shape { public: - static int32_t const UNSPECIFIED_DIM; - static int32_t const MAX_RANK; + static int32_t const kUnspecifiedDim; + static int32_t const kMaxRank; Shape() = default; @@ -126,7 +126,7 @@ public: */ bool hasUnspecifiedDims() const { - return (std::find(_dimensions.begin(), _dimensions.end(), UNSPECIFIED_DIM) != + return (std::find(_dimensions.begin(), _dimensions.end(), kUnspecifiedDim) != _dimensions.end()); } diff --git a/runtime/onert/core/include/util/Config.lst b/runtime/onert/core/include/util/Config.lst index 4bbc02ac3..b9bad1b59 100644 --- a/runtime/onert/core/include/util/Config.lst +++ b/runtime/onert/core/include/util/Config.lst @@ -23,7 +23,6 @@ CONFIG(GRAPH_DOT_DUMP , int , "0") CONFIG(BACKENDS , std::string , "cpu;acl_cl;acl_neon;ruy;xnnpack;gpu_cl;trix;bcq") // FIXME Remove bcq CONFIG(OP_BACKEND_ALLOPS , std::string , "") 
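The std::hash specialization for ir::IODesc added in NNPkg.h above packs the three index values of an IO descriptor into a single size_t. A standalone sketch of that bit layout, using plain integers rather than onert's index types (the function name and parameter types here are illustrative only):

#include <cstddef>
#include <cstdint>

// Same layout as the hash above: the first index lands in bits 24 and up,
// the second in bits 16..23, and the third in the low bits.
inline std::size_t packIODesc(std::uint32_t model, std::uint32_t subg, std::uint32_t io)
{
  return (static_cast<std::size_t>(model) << 24) | (static_cast<std::size_t>(subg) << 16) | io;
}

// Caveat: a middle index above 255 or a low index of 65536 or more spills into the
// neighbouring field, so such descriptors can produce hash collisions.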
CONFIG(OP_BACKEND_MAP , std::string , "") -CONFIG(DISABLE_COMPILE , bool , "0") CONFIG(ONERT_LOG_ENABLE , bool , "0") CONFIG(CPU_MEMORY_PLANNER , std::string , "WIC") CONFIG(EXECUTOR , std::string , "Linear") diff --git a/runtime/onert/core/include/util/Index.h b/runtime/onert/core/include/util/Index.h index d3f3dcb46..49c5f4c6d 100644 --- a/runtime/onert/core/include/util/Index.h +++ b/runtime/onert/core/include/util/Index.h @@ -138,6 +138,13 @@ public: */ T value() const { return _index; } + /** + * @brief Return max index value + * + * @return Maximum valid index value + */ + static T max() { return UNDEFINED - 1; } + private: T _index; }; diff --git a/runtime/onert/core/include/util/ObjectManager.h b/runtime/onert/core/include/util/ObjectManager.h index 36b6c85c8..077a4c2ef 100644 --- a/runtime/onert/core/include/util/ObjectManager.h +++ b/runtime/onert/core/include/util/ObjectManager.h @@ -202,12 +202,12 @@ public: // This implementation is a workaround in case of adding operands while iteration std::list l; - for (auto &e : _objects) + for (const auto &e : _objects) { l.push_back(e.first); } - for (auto &index : l) + for (const auto &index : l) { fn(index, *_objects[index]); } diff --git a/runtime/onert/core/include/util/Utils.h b/runtime/onert/core/include/util/Utils.h index 8a4eea32b..505f5a9b3 100644 --- a/runtime/onert/core/include/util/Utils.h +++ b/runtime/onert/core/include/util/Utils.h @@ -29,9 +29,9 @@ template struct ForEachDimension { - template + template static void unroll(const onert::ir::Shape &shape, onert::ir::Coordinates &coords, - L &&lambda_function, Args &&... args) + L lambda_function) { static_assert(from < to, "from must not be less than to"); assert(static_cast(to) <= shape.rank()); @@ -40,8 +40,7 @@ template struct ForEachDimensio for (auto v = 0; v < d; v++) { coords.set(from, v); - ForEachDimension::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension::unroll(shape, coords, lambda_function); } } }; @@ -49,18 +48,17 @@ template struct ForEachDimensio template struct ForEachDimension::type> { - template + template static void unroll(const onert::ir::Shape &shape, onert::ir::Coordinates &coords, - L &&lambda_function, Args &&... args) + L lambda_function) { UNUSED_RELEASE(shape); assert(static_cast(to) <= shape.rank()); - lambda_function(coords, std::forward(args)...); + lambda_function(coords); } }; -template -inline void ShapeLoop(const onert::ir::Shape &shape, L &&lambda_function, Args &&... 
args) +template inline void ShapeLoop(const onert::ir::Shape &shape, L lambda_function) { assert(shape.rank() > 0); for (auto i = 0; i < shape.rank(); ++i) @@ -73,32 +71,25 @@ inline void ShapeLoop(const onert::ir::Shape &shape, L &&lambda_function, Args & { case 0: coords.set(0, 0); - ForEachDimension<0, 0>::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension<0, 0>::unroll(shape, coords, lambda_function); break; case 1: - ForEachDimension<0, 1>::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension<0, 1>::unroll(shape, coords, lambda_function); break; case 2: - ForEachDimension<0, 2>::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension<0, 2>::unroll(shape, coords, lambda_function); break; case 3: - ForEachDimension<0, 3>::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension<0, 3>::unroll(shape, coords, lambda_function); break; case 4: - ForEachDimension<0, 4>::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension<0, 4>::unroll(shape, coords, lambda_function); break; case 5: - ForEachDimension<0, 5>::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension<0, 5>::unroll(shape, coords, lambda_function); break; case 6: - ForEachDimension<0, 6>::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension<0, 6>::unroll(shape, coords, lambda_function); break; default: assert(false && "ShapeLoop, 1 <= Shape'rank <= 6"); diff --git a/runtime/onert/core/src/backend/basic/MemoryManager.cc b/runtime/onert/core/src/backend/basic/MemoryManager.cc index c468ee458..05fd9cc77 100644 --- a/runtime/onert/core/src/backend/basic/MemoryManager.cc +++ b/runtime/onert/core/src/backend/basic/MemoryManager.cc @@ -94,7 +94,7 @@ void DynamicMemoryManager::deallocate(const ITensor *tensor) void DynamicMemoryManager::deallocate(void) { - for (auto &mem_alloc : _mem_alloc_map) + for (auto &&mem_alloc : _mem_alloc_map) { // Release memory buffer of mem_alloc mem_alloc.second->release(); diff --git a/runtime/onert/core/src/backend/basic/MemoryPlanner.cc b/runtime/onert/core/src/backend/basic/MemoryPlanner.cc index 1fda57b3d..1c048043c 100644 --- a/runtime/onert/core/src/backend/basic/MemoryPlanner.cc +++ b/runtime/onert/core/src/backend/basic/MemoryPlanner.cc @@ -58,7 +58,7 @@ void FirstFitPlanner::claim(const ir::OperandIndex &ind, size_t size) { // Find the right position for claiming uint32_t next_offset = 0; - for (auto &mem_claim : _claim_table) + for (const auto &mem_claim : _claim_table) { auto claimed_base_offset = mem_claim.first; auto claimed_size = _mem_plans[mem_claim.second].size; diff --git a/runtime/onert/core/src/backend/basic/StaticTensorManager.cc b/runtime/onert/core/src/backend/basic/StaticTensorManager.cc index d891814fa..b03eb607c 100644 --- a/runtime/onert/core/src/backend/basic/StaticTensorManager.cc +++ b/runtime/onert/core/src/backend/basic/StaticTensorManager.cc @@ -39,7 +39,7 @@ void StaticTensorManager::allocateNonconsts(void) { _nonconst_mgr->allocate(); - for (auto &pair : _tensors->native_tensors()) + for (auto &&pair : _tensors->native_tensors()) { const auto &ind = pair.first; auto tensor = pair.second.get(); diff --git a/runtime/onert/core/src/backend/builtin/BackendContext.cc b/runtime/onert/core/src/backend/builtin/BackendContext.cc index 8a6cddcfb..c1a2ed537 100644 --- 
a/runtime/onert/core/src/backend/builtin/BackendContext.cc +++ b/runtime/onert/core/src/backend/builtin/BackendContext.cc @@ -44,7 +44,7 @@ FunctionMap BackendContext::genKernels() const_cast(graph())->operands().iterate( [&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - for (auto &it : ret) + for (auto &&it : ret) { auto &fn_seq = it.second; fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); diff --git a/runtime/onert/core/src/backend/builtin/IOTensor.h b/runtime/onert/core/src/backend/builtin/IOTensor.h index a1b2064a1..d94ed0bca 100644 --- a/runtime/onert/core/src/backend/builtin/IOTensor.h +++ b/runtime/onert/core/src/backend/builtin/IOTensor.h @@ -47,7 +47,7 @@ public: public: void setTensor(IPortableTensor *tensor); void setUserTensor(uint8_t *buffer, size_t size); - ir::OperandInfo orig_info() const { return _orig_info; } + const ir::OperandInfo &orig_info() const { return _orig_info; } ir::Layout orig_layout() const { return _orig_layout; } public: diff --git a/runtime/onert/core/src/backend/builtin/KernelGenerator.cc b/runtime/onert/core/src/backend/builtin/KernelGenerator.cc index fa2fc0b94..4533703a6 100644 --- a/runtime/onert/core/src/backend/builtin/KernelGenerator.cc +++ b/runtime/onert/core/src/backend/builtin/KernelGenerator.cc @@ -33,8 +33,8 @@ KernelGenerator::KernelGenerator(const ir::Graph &graph, DynamicTensorManager *d const std::shared_ptr &tensor_reg, const std::shared_ptr &external_context) : basic::KernelGeneratorBase{graph}, _dyn_tensor_manager{dyn_tensor_manager}, - _tensor_reg{tensor_reg}, _tensor_registries{}, _executors{nullptr}, _external_context{ - external_context} + _tensor_reg{tensor_reg}, _tensor_registries{}, _executors{nullptr}, _model_index{}, + _external_context{external_context} { UNUSED_RELEASE(_graph); UNUSED_RELEASE(_tensor_registries); @@ -90,7 +90,7 @@ void KernelGenerator::visit(const ir::operation::If &node) input_tensors.erase(input_tensors.begin()); auto fn = std::make_unique<::onert::backend::builtin::kernel::IfLayer>( cond_tensor, input_tensors, output_tensors, then_subg_index, else_subg_index, _executors, - _external_context); + _model_index, _external_context); _return_fn = std::move(fn); } @@ -133,7 +133,7 @@ void KernelGenerator::visit(const ir::operation::While &node) // WhileLayer just set Executors instead of cond and body executor to avoid complexity of // creating executor recusively auto fn = std::make_unique<::onert::backend::builtin::kernel::WhileLayer>( - input_tensors, output_tensors, cond_subg_index, body_subg_index, _executors, + input_tensors, output_tensors, cond_subg_index, body_subg_index, _executors, _model_index, _dyn_tensor_manager->dynamic_mem_mgr().get(), _external_context); _return_fn = std::move(fn); diff --git a/runtime/onert/core/src/backend/builtin/KernelGenerator.h b/runtime/onert/core/src/backend/builtin/KernelGenerator.h index d5931ca26..3c86fe306 100644 --- a/runtime/onert/core/src/backend/builtin/KernelGenerator.h +++ b/runtime/onert/core/src/backend/builtin/KernelGenerator.h @@ -23,7 +23,7 @@ #include "../../compiler/TensorRegistries.h" #include "backend/basic/KernelGeneratorBase.h" -#include "exec/Executors.h" +#include "exec/IExecutors.h" #include "ir/Graph.h" namespace onert @@ -44,12 +44,14 @@ public: { _tensor_registries = tensor_registries; } - void setExecutors(const std::shared_ptr &executors) + void setExecutors(const std::shared_ptr &executors) { // FIXME Using shared_ptr's raw pointer! 
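With this change, the builtin kernel generator and the If/While kernels look up a child subgraph executor by both model index and subgraph index instead of by subgraph index alone. A minimal sketch of that lookup pattern, using the include paths that appear in this diff:

#include "exec/IExecutors.h"
#include "ir/Index.h"

// Resolve the executor of a child subgraph inside a possibly multi-model package.
inline onert::exec::IExecutor *lookupSubgraphExecutor(onert::exec::IExecutors *executors,
                                                      const onert::ir::ModelIndex &model_index,
                                                      const onert::ir::SubgraphIndex &subg_index)
{
  // IExecutors::at() is keyed by (model, subgraph); IfLayer and WhileLayer use the same call.
  return executors->at(model_index, subg_index);
}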
_executors = executors.get(); } + void setModelIndex(const ir::ModelIndex &index) { _model_index = index; } + std::unique_ptr generate(ir::OperationIndex ind) override; private: @@ -65,7 +67,8 @@ private: DynamicTensorManager *_dyn_tensor_manager; std::shared_ptr _tensor_reg; compiler::TensorRegistries _tensor_registries; - exec::Executors *_executors; + exec::IExecutors *_executors; + ir::ModelIndex _model_index; const std::shared_ptr _external_context; }; diff --git a/runtime/onert/core/src/backend/builtin/kernel/IfLayer.cc b/runtime/onert/core/src/backend/builtin/kernel/IfLayer.cc index cdb41960a..51bc5a8f2 100644 --- a/runtime/onert/core/src/backend/builtin/kernel/IfLayer.cc +++ b/runtime/onert/core/src/backend/builtin/kernel/IfLayer.cc @@ -29,11 +29,11 @@ IfLayer::IfLayer(backend::IPortableTensor *cond_tensor, const std::vector input_tensors, const std::vector output_tensors, const ir::SubgraphIndex &then_subg_index, const ir::SubgraphIndex &else_subg_index, - exec::Executors *executors, + exec::IExecutors *executors, const ir::ModelIndex &model_index, const std::shared_ptr &external_context) : _cond_tensor{cond_tensor}, _input_tensors{input_tensors}, _output_tensors{output_tensors}, _then_subg_index{then_subg_index}, _else_subg_index{else_subg_index}, _executors{executors}, - _external_context{external_context} + _model_index{model_index}, _external_context{external_context} { // At this point, executors may not have executors of then subg and else subg } @@ -61,12 +61,12 @@ void IfLayer::run() if (cond_result) { VERBOSE(If) << "Call to $" << _then_subg_index << " (then)" << std::endl; - subg_exec = _executors->at(_then_subg_index).get(); + subg_exec = _executors->at(_model_index, _then_subg_index); } else { VERBOSE(If) << "Call to $" << _else_subg_index << " (else)" << std::endl; - subg_exec = _executors->at(_else_subg_index).get(); + subg_exec = _executors->at(_model_index, _else_subg_index); } subg_exec->execute(_input_tensors, _output_tensors); diff --git a/runtime/onert/core/src/backend/builtin/kernel/IfLayer.h b/runtime/onert/core/src/backend/builtin/kernel/IfLayer.h index fa5537a67..8f639ced9 100644 --- a/runtime/onert/core/src/backend/builtin/kernel/IfLayer.h +++ b/runtime/onert/core/src/backend/builtin/kernel/IfLayer.h @@ -18,7 +18,7 @@ #define __ONERT_BACKEND_BUILTIN_KERNEL_IF_LAYER_H__ #include -#include +#include #include "../ExternalContext.h" namespace onert @@ -37,7 +37,8 @@ public: const std::vector input_tensors, const std::vector output_tensors, const ir::SubgraphIndex &then_subg_index, const ir::SubgraphIndex &else_subg_index, - exec::Executors *executors, const std::shared_ptr &external_context); + exec::IExecutors *executors, const ir::ModelIndex &model_index, + const std::shared_ptr &external_context); public: void run() override; @@ -48,7 +49,8 @@ private: const std::vector _output_tensors; const ir::SubgraphIndex _then_subg_index; const ir::SubgraphIndex _else_subg_index; - exec::Executors *_executors; + exec::IExecutors *_executors; + ir::ModelIndex _model_index; const std::shared_ptr _external_context; }; diff --git a/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc b/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc index ddaecdf57..600180077 100644 --- a/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc +++ b/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc @@ -64,7 +64,7 @@ void PermuteLayer::optimize() src_offsets_it->resize(0); dst_offsets_it->resize(0); if (underlying_type(src->data_type()) != 
underlying_type(dst->data_type())) - throw std::runtime_error("data type does not match"); + continue; const auto permute_type = [&]() -> PermuteType { if (src->getShape().rank() == 4 && src->layout() == ir::Layout::NHWC && dst->layout() == ir::Layout::NCHW) @@ -81,6 +81,8 @@ void PermuteLayer::optimize() return PermuteType::COPY; } }(); + + // TODO Support different types auto fn = [&](backend::ITensor &src_tensor) { dst->access([&](backend::ITensor &dst_tensor) { // NOTE The buffer of both tensor can be nullptr in this step @@ -260,8 +262,10 @@ void PermuteLayer::run() // 1. The tasks for multithreathing was created // 2. The tasks's size > 1 // 3. Both tensors are not dynamic + // 4. Data types of both tensors are different if (_tasks_map.find(src) == _tasks_map.end() || _tasks_map.at(src).size() == 1 || - src->is_dynamic() || dst->is_dynamic()) + src->is_dynamic() || dst->is_dynamic() || + underlying_type(src->data_type()) != underlying_type(dst->data_type())) { permute(src, dst, src->getShape().rank(), src_offsets, dst_offsets); } diff --git a/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc b/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc index 8e006c5ea..c0ca4046c 100644 --- a/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc +++ b/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc @@ -35,12 +35,14 @@ namespace kernel WhileLayer::WhileLayer(const std::vector input_tensors, const std::vector output_tensors, const ir::SubgraphIndex &cond_subg_index, - const ir::SubgraphIndex &body_subg_index, exec::Executors *executors, + const ir::SubgraphIndex &body_subg_index, exec::IExecutors *executors, + const ir::ModelIndex &model_index, basic::DynamicMemoryManager *dyn_memory_manager, const std::shared_ptr &external_context) : _cond_subg_index{cond_subg_index}, _body_subg_index{body_subg_index}, _input_tensors{input_tensors}, _output_tensors{output_tensors}, _executors{executors}, - _dyn_memory_manager{dyn_memory_manager}, _external_context{external_context} + _model_index{model_index}, _dyn_memory_manager{dyn_memory_manager}, _external_context{ + external_context} { // At this point, executors may not have executors of cond subg and body subg } @@ -57,8 +59,8 @@ void WhileLayer::run() // // Run cond subg // If there is no loop copy "_input_tensors" -> "_dst_tensors", else copy "cond subg inputs" -> // "_dst_tensors" - auto cond_exec = _executors->at(_cond_subg_index).get(); - auto body_exec = _executors->at(_body_subg_index).get(); + auto cond_exec = _executors->at(_model_index, _cond_subg_index); + auto body_exec = _executors->at(_model_index, _body_subg_index); // Need a temp tensor to hold the cond subgraph output assert(cond_exec->getOutputTensors().size() == 1); diff --git a/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.h b/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.h index 8551b3d09..40ca4fe23 100644 --- a/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.h +++ b/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.h @@ -18,7 +18,7 @@ #define __ONERT_BACKEND_BUILTIN_KERNEL_WHILE_LAYER_H__ #include -#include +#include #include #include #include @@ -41,7 +41,8 @@ public: WhileLayer(const std::vector input_tensors, const std::vector output_tensors, const ir::SubgraphIndex &cond_subg_index, const ir::SubgraphIndex &body_subg_index, - exec::Executors *executors, basic::DynamicMemoryManager *dyn_memory_manager, + exec::IExecutors *executors, const ir::ModelIndex &model_index, + basic::DynamicMemoryManager 
*dyn_memory_manager, const std::shared_ptr &external_context); public: @@ -52,7 +53,8 @@ private: const ir::SubgraphIndex _body_subg_index; const std::vector _input_tensors; const std::vector _output_tensors; - exec::Executors *_executors; + exec::IExecutors *_executors; + const ir::ModelIndex _model_index; basic::DynamicMemoryManager *_dyn_memory_manager; // For generating temp tensors const std::shared_ptr _external_context; }; diff --git a/runtime/onert/core/src/compiler/Compiler.cc b/runtime/onert/core/src/compiler/Compiler.cc index 7be9c1e3b..45124556b 100644 --- a/runtime/onert/core/src/compiler/Compiler.cc +++ b/runtime/onert/core/src/compiler/Compiler.cc @@ -22,543 +22,96 @@ #include "pass/OddOutputPass.h" #include "pass/PassRunner.h" #include "pass/UnusedOperandEliminationPass.h" -#include "../backend/builtin/Config.h" #include "../dumper/dot/DotDumper.h" -#include "../interp/InterpExecutor.h" -#include "../ir/OperationCloner.h" +#include "../exec/SingleModelExecutors.h" #include "../ir/OperationDumper.h" #include "../ir/verifier/Verifier.h" #include "compiler/StaticShapeInferer.h" -#include "util/ConfigSource.h" -#include "util/logging.h" -#include #include -#include - -// TODO Remove using fstream header -#include - -namespace -{ - -using namespace onert; - -std::string getOpBackends(std::unordered_map &opcode_to_backend) -{ - std::unordered_map::iterator it; - std::string opbackends; - - for (it = opcode_to_backend.begin(); it != opcode_to_backend.end(); ++it) - { - if (!opbackends.empty()) - opbackends = opbackends + ", "; - - auto opcode = it->first; - const std::string opname = ir::toString(opcode); - opbackends += opname + "=" + it->second; - } - return opbackends; -} - -void verboseOptions(compiler::CompilerOptions &options) -{ - VERBOSE(Compiler) << std::boolalpha << "==== Compiler Options ====" << std::endl; - VERBOSE(Compiler) << "backend_list : " - << nnfw::misc::join(options.backend_list.begin(), options.backend_list.end(), - "/") - << std::endl; - VERBOSE(Compiler) << "trace_filepath : " << options.trace_filepath << std::endl; - VERBOSE(Compiler) << "graph_dump_level : " << options.graph_dump_level << std::endl; - VERBOSE(Compiler) << "executor : " << options.executor << std::endl; - VERBOSE(Compiler) << "manual backend_for_all : " - << options.manual_scheduler_options.backend_for_all << std::endl; - VERBOSE(Compiler) << "manual_scheduler_options : " - << getOpBackends(options.manual_scheduler_options.opcode_to_backend) - << std::endl; - VERBOSE(Compiler) << "he_scheduler : " << options.he_scheduler << std::endl; - VERBOSE(Compiler) << "he_profiling_mode : " << options.he_profiling_mode << std::endl; - VERBOSE(Compiler) << "disable_compile : " << options.disable_compile << std::endl; - VERBOSE(Compiler) << "fp16_enable : " << options.fp16_enable << std::endl - << std::noboolalpha; -} - -std::unordered_map> -createStaticShapeInferers( - const std::unordered_map> - &lowered_subgs) -{ - // Allocate StaticShapeInferer per each subgraph - std::unordered_map> inferers; - for (auto &pair : lowered_subgs) - { - const auto &subg_index = pair.first; - auto &lowered_subg = pair.second; - inferers[subg_index] = std::make_unique(lowered_subg.get()); - } - - // Append observers in all StaticShapeInferers - for (auto &pair : lowered_subgs) - { - const auto &subg_index = pair.first; - auto &lowered_subg = pair.second; - - // TODO: Change this iteration for all to controlflow iteration - lowered_subg->graph().operations().iterate([&](const ir::OperationIndex &, - const ir::Operation 
&op) { - // A Function to append child inferers. These make it possible for a StaticShapeInferer to - // call StaticShapeInferes of child subgraphs recursively - auto appendChildInferer = [&](const ir::SubgraphIndex &child_subg_idx) { - auto *child_inferer = inferers.at(child_subg_idx).get(); - inferers.at(subg_index)->appendChildInferer(child_subg_idx, child_inferer); - }; - - // A Function to appaend subg input observers. This makes it possible for a StaticShapeInferer - // to update inputs of child subgraphs - auto appendSubgraphInputObserver = [&](const ir::SubgraphIndex &child_subg_idx) { - std::vector child_subg_inputs; - auto &child_subg = lowered_subgs.at(child_subg_idx)->graph(); - for (const auto &input_idx : child_subg.getInputs()) - { - auto operand_ptr = child_subg.operands().getRawPtr(input_idx); - child_subg_inputs.emplace_back(operand_ptr); - } - inferers.at(subg_index) - ->appendSubgInputObserver(child_subg_idx, - std::make_unique(child_subg_inputs)); - }; - - // A Function to set controlflow output observers. This makes it possible for a - // StaticShapeInferer to update outputs of parent controlflow opeerations - auto setControlFlowOutputObserver = [&](const ir::SubgraphIndex &child_subg_idx) { - std::vector cf_outputs; - auto &subg = lowered_subg->graph(); - for (const auto &output_idx : op.getOutputs()) - { - auto operand_ptr = subg.operands().getRawPtr(output_idx); - cf_outputs.emplace_back(operand_ptr); - } - inferers.at(child_subg_idx) - ->setControlflowOutputObserver(std::make_unique(cf_outputs)); - }; - - // Append Observers in a StaticShapeInferer - if (op.opcode() == ir::OpCode::If) - { - const auto &if_op = nnfw::misc::polymorphic_downcast(op); - - appendChildInferer(if_op.param().then_subg_index); - appendChildInferer(if_op.param().else_subg_index); - - appendSubgraphInputObserver(if_op.param().then_subg_index); - appendSubgraphInputObserver(if_op.param().else_subg_index); - - setControlFlowOutputObserver(if_op.param().then_subg_index); - } - else if (op.opcode() == ir::OpCode::While) - { - const auto &while_op = nnfw::misc::polymorphic_downcast(op); - - appendChildInferer(while_op.param().cond_subg_index); - appendChildInferer(while_op.param().body_subg_index); - - appendSubgraphInputObserver(while_op.param().cond_subg_index); - appendSubgraphInputObserver(while_op.param().body_subg_index); - - setControlFlowOutputObserver(while_op.param().body_subg_index); - } - }); - } - - return inferers; -} - -} // namespace namespace onert { - namespace compiler { -void ManualSchedulerOptions::setBackendMap(const std::string &str) -{ - // TODO Support multiple subgraphs for manual scheduling - auto key_val_list = nnfw::misc::split(str, ';'); - for (const auto &key_val_str : key_val_list) - { - if (key_val_str.empty()) - { - continue; - } - - auto key_val = nnfw::misc::split(key_val_str, '='); - const auto &key_str = key_val.at(0); - const auto &val = key_val.at(1); - auto key = static_cast(std::stoi(key_str)); - this->index_to_backend.emplace(ir::OperationIndex{key}, val); - } -} - -std::unique_ptr CompilerOptions::fromGlobalConfig() -{ - auto o = std::make_unique(); - o->backend_list = nnfw::misc::split(util::getConfigString(util::config::BACKENDS), ';'); - o->trace_filepath = util::getConfigString(util::config::TRACE_FILEPATH); - o->graph_dump_level = util::getConfigInt(util::config::GRAPH_DOT_DUMP); - o->executor = util::getConfigString(util::config::EXECUTOR); - o->he_scheduler = util::getConfigBool(util::config::USE_SCHEDULER); - o->he_profiling_mode = 
util::getConfigBool(util::config::PROFILING_MODE); - o->disable_compile = util::getConfigBool(util::config::DISABLE_COMPILE); - o->fp16_enable = util::getConfigBool(util::config::FP16_ENABLE); - { - // Backend for all - auto &ms_options = o->manual_scheduler_options; - - // Default value for op_backend_all is first element in the backend list - ms_options.backend_for_all = util::getConfigString(util::config::OP_BACKEND_ALLOPS); - -// Opcode to Backend -#define OP(OpName) \ - { \ - const auto &backend_str = util::getConfigString(util::config::OP_BACKEND_##OpName); \ - if (!backend_str.empty()) \ - { \ - ms_options.opcode_to_backend[ir::OpCode::OpName] = backend_str; \ - } \ - } -#include "ir/Operations.lst" -#undef OP - - // Index to Backend - auto map_str = util::getConfigString(util::config::OP_BACKEND_MAP); - ms_options.setBackendMap(map_str); - } - return o; -} Compiler::Compiler(const std::shared_ptr &model, CompilerOptions &copt) - : _nnpkg{std::make_shared(model)}, _state{State::CREATED}, _voptions{&copt} + : _model{model}, _options{&copt} { // DO NOTHING } Compiler::Compiler(const std::shared_ptr &nnpkg, std::vector> &copts) - : _nnpkg{nnpkg}, _state{State::CREATED}, _voptions{} + : _model{nnpkg->primary_model()}, _options{copts[0].get()} { - for (uint32_t i = 0; i < copts.size(); i++) - { - _voptions.push_back(copts[i].get()); - } -} - -void Compiler::enableToFp16() -{ - for (auto options : _voptions) - options->fp16_enable = true; -} - -void Compiler::checkProfilerConditions() -{ - if (_nnpkg->model_count() != 1) - throw std::runtime_error("NYI: Profiling mode for multiple model is not supported yet"); - - auto &options = *_voptions[0]; - - if (options.he_scheduler) - throw std::runtime_error("Heterogeneous scheduler must be enabled during profiling."); - - if (options.executor != "Dataflow") - throw std::runtime_error("Profiling mode works only with 'Dataflow' executor"); -} - -bool Compiler::buildPartialGraph(uint32_t num_graphs) -{ - // Use 1st model and options only on partial graph (pipeline) compile - assert(_nnpkg->model_count() == 1); - assert(_voptions.size() == 1); - - auto model = _nnpkg->primary_model(); - auto &options = *_voptions[0]; - - if (model->subgraphs_count() > 1) - return false; - - auto partialgraphs = std::make_shared(); - - for (uint32_t idx = 0; idx < num_graphs; idx++) - { - auto partialgraph = std::make_unique(); - partialgraphs->push(ir::SubgraphIndex{idx}, std::move(partialgraph)); - } - model->primary_subgraph()->setPartialModel(partialgraphs); - - auto partial_graph = primary_subgraph()->partialgraphs(); - - primary_subgraph()->operands().iterate( - [&](const ir::OperandIndex &operand_index, const ir::Operand &operand) { - auto use_operations = operand.getUses(); - - for (auto use_operation : use_operations) - { - auto graph_index = options.partial_graph_options.index_to_graph.find(use_operation); - if (graph_index == options.partial_graph_options.index_to_graph.end()) - { - throw std::runtime_error("Invalid Partition Map"); - } - auto partition = partial_graph->at(graph_index->second); - - if (partition->operands().exist(operand_index)) - { - continue; - } - - auto new_operand = std::make_unique(operand); - new_operand->clearDefUse(); - auto new_operand_ind = partition->addOperand(operand_index, std::move(new_operand)); - UNUSED_RELEASE(new_operand_ind); - assert(new_operand_ind == operand_index); - } - }); - - primary_subgraph()->operations().iterate( - [&](const ir::OperationIndex &operation_index, const ir::Operation &operation) { - auto 
graph_index = options.partial_graph_options.index_to_graph.find(operation_index); - if (graph_index == options.partial_graph_options.index_to_graph.end()) - { - throw std::runtime_error("Invalid Partition Map"); - } - auto partition = partial_graph->at(graph_index->second); - - auto operand_io = (operation.getInputs() + operation.getOutputs()) | ir::Remove::DUPLICATED | - ir::Remove::UNDEFINED; - for (auto operand_index : operand_io) - { - if (partition->operands().exist(operand_index)) - continue; - - const auto &operand = primary_subgraph()->operands().at(operand_index); - - auto new_operand = std::make_unique(operand); - new_operand->clearDefUse(); - - auto new_operand_index = partition->addOperand(operand_index, std::move(new_operand)); - UNUSED_RELEASE(new_operand_index); - assert(new_operand_index == operand_index); - } - - auto new_operation_index = partition->addOperation(operation_index, clone(operation)); - UNUSED_RELEASE(new_operation_index); - assert(new_operation_index == operation_index); - }); - - for (uint32_t idx = 0; idx < partial_graph->subgraphs_count(); idx++) - { - auto partition = partial_graph->at(ir::SubgraphIndex{idx}); - - partition->operands().iterate([&](const ir::OperandIndex &operand_index, - const ir::Operand &operand) { - if (primary_subgraph()->getInputs().contains(operand_index) || - (!operand.getDef().valid() && !operand.isConstant())) - { - partition->addInput(operand_index, primary_subgraph()->tensor_names()->at(operand_index)); - } - if (primary_subgraph()->getOutputs().contains(operand_index) || operand.getUses().size() == 0) - { - partition->addOutput(operand_index, primary_subgraph()->tensor_names()->at(operand_index)); - } - - if (primary_subgraph()->operands().at(operand_index).getUses().size() > 1 && - !primary_subgraph()->operands().at(operand_index).isConstant() && - !partition->getInputs().contains(operand_index)) - { - auto use_operations = primary_subgraph()->operands().at(operand_index).getUses(); - auto iter = use_operations.begin(); - ir::SubgraphIndex graph_index = - options.partial_graph_options.index_to_graph.find(*iter++)->second; - while (iter != use_operations.end()) - { - if (graph_index != options.partial_graph_options.index_to_graph.find(*iter)->second && - !partition->getOutputs().contains(operand_index)) - { - partition->addOutput(operand_index, - primary_subgraph()->tensor_names()->at(operand_index)); - } - iter++; - } - } - }); - - partition->verify(); - - bool same = true; - if (partition->getInputs().size() == primary_subgraph()->getInputs().size()) - { - for (auto iter = partition->getInputs().begin(); iter != partition->getInputs().end(); ++iter) - { - if (!primary_subgraph()->getInputs().contains(*iter)) - { - same = false; - break; - } - } - if (same == true) - { - partition->getInputs() = primary_subgraph()->getInputs(); - } - else - { - partition->input_sort(); - } - } - - same = true; - if (partition->getOutputs().size() == primary_subgraph()->getOutputs().size()) - { - for (auto iter = partition->getOutputs().begin(); iter != partition->getOutputs().end(); - ++iter) - { - if (!primary_subgraph()->getOutputs().contains(*iter)) - { - same = false; - break; - } - } - if (same == true) - { - partition->getOutputs() = primary_subgraph()->getOutputs(); - } - else - { - partition->output_sort(); - } - } - } - return true; + // Use for single model only + assert(nnpkg->model_count() == 1); } std::shared_ptr Compiler::compile(void) { - for (auto options : _voptions) - { - // Set control flow backend for control flow 
operators - auto &builtin_id = backend::builtin::Config::ID; - options->manual_scheduler_options.opcode_to_backend[ir::OpCode::If] = builtin_id; - options->manual_scheduler_options.opcode_to_backend[ir::OpCode::While] = builtin_id; - options->manual_scheduler_options.opcode_to_backend[ir::OpCode::Permute] = builtin_id; - - // FIXME This is a workaround for bcq operations, should remove it - options->manual_scheduler_options.opcode_to_backend[ir::OpCode::BCQFullyConnected] = "bcq"; - options->manual_scheduler_options.opcode_to_backend[ir::OpCode::BCQGather] = "bcq"; - - // FIXME This is a workaround for bulk operations, should remove it - options->manual_scheduler_options.opcode_to_backend[ir::OpCode::Bulk] = "trix"; - - verboseOptions(*options); - } - - // NYI: allow one model compilation - auto const model_count = _nnpkg->model_count(); - if (model_count != _voptions.size()) - throw std::runtime_error{"Model count and option vector size mismatch"}; - - for (uint32_t i = 0; i < model_count; i++) - { - _nnpkg->model(ir::ModelIndex{i})->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) { - // Mandatory passes - pass::PassRunner{} - .append(std::make_unique(subg)) - .append(std::make_unique(subg)) - .run(); - - // Optimizations - pass::PassRunner{}.append(std::make_unique(subg)).run(); - }); - } - /*************************************************** * Prepare compilation phase ***************************************************/ - // Compilable check - // TODO: Support hybrid execution - - // execution between interpreter and compiled executor (including control flow) - if (_voptions[0]->disable_compile) - { - if (model_count > 1) - throw std::runtime_error{"NYI: Disable compilation for multi model is not supported yet"}; + if (!_options) + throw std::runtime_error{"Empty compile option"}; - auto executors = std::make_shared(); + // Mode check + // TODO handle option for each model + if (_options->he_profiling_mode) + { + if (!_options->he_scheduler) + throw std::runtime_error("Heterogeneous scheduler must be enabled during profiling."); - _nnpkg->primary_model()->iterate([&](const ir::SubgraphIndex &index, ir::Graph &subg) { - executors->emplace(index, std::make_unique(subg)); - }); - _state = State::COMPILED; - return std::make_shared(executors, nullptr); + if (_options->executor != "Dataflow") + throw std::runtime_error("Profiling mode works only with 'Dataflow' executor"); } - // Mode check - // TODO handle option for each model - if (_voptions[0]->he_profiling_mode) - checkProfilerConditions(); + _options->forceInternalOptions(); + _options->verboseOptions(); + + _model->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) { + // Mandatory passes + pass::PassRunner{} + .append(std::make_unique(subg)) + .append(std::make_unique(subg)) + .run(); + + // Optimizations + pass::PassRunner{}.append(std::make_unique(subg)).run(); + }); /*************************************************** * Backend independent analysis & optimization phase ***************************************************/ // TODO Handle dump level for each model - auto dump_level = static_cast(_voptions[0]->graph_dump_level); + auto dump_level = static_cast(_options->graph_dump_level); onert::dumper::dot::DotDumper dot_dumper(dump_level); // Tracing context auto tracing_ctx = std::make_unique(); - // Model edge context - std::unique_ptr model_edges = nullptr; - // Lower: Assign backend std::unordered_map> lowered_subgs; - - if (model_count == 1) { - _nnpkg->primary_model()->iterate([&](const ir::SubgraphIndex &index, 
ir::Graph &subg) { - dot_dumper.dump(subg, nnfw::misc::str("before_lower_subg-", index.value())); + _model->iterate([&](const ir::SubgraphIndex &subg_index, ir::Graph &subg) { // Lower: Assign backend - lowered_subgs[index] = std::make_unique(subg, *_voptions[0]); + lowered_subgs[subg_index] = std::make_unique(subg, *_options); // Set tracing_ctx for copied graph - tracing_ctx->setSubgraphIndex(&(lowered_subgs[index]->graph()), index.value()); + if (tracing_ctx != nullptr) + tracing_ctx->setSubgraphIndex(&(lowered_subgs[subg_index]->graph()), subg_index.value()); }); } - else - { - // TODO Support tracing_ctx for multiple model - tracing_ctx = nullptr; - // Copy model edge context - model_edges = std::make_unique(_nnpkg->model_edges()); + _model.reset(); - for (uint32_t i = 0; i < model_count; i++) - { - auto model = _nnpkg->model(ir::ModelIndex{i}); - if (model->subgraphs_count() != 1) - throw std::runtime_error{"NYI: Lowering subgraphs for multiple model is not supported yet"}; - auto subg = model->primary_subgraph(); - dot_dumper.dump(*subg, nnfw::misc::str("before_lower_model-", i)); - - // For multimodel, model index is used for lowered graph index in lowered graph map - // and index type is SubgraphIndex - // TODO Find better way to represent lowered graph index for multimodel's subgraph - lowered_subgs[ir::SubgraphIndex{i}] = - std::make_unique(*model->primary_subgraph(), *_voptions[i]); - } - } - - _nnpkg.reset(); - - for (auto &pair : lowered_subgs) + for (const auto &pair : lowered_subgs) { const auto &subg_index = pair.first; - auto &lowered_subg = pair.second; - dot_dumper.dump(*lowered_subg, "after_lower_subg-" + std::to_string(subg_index.value())); + const auto &lowered_subg = pair.second; + dot_dumper.dump(*lowered_subg, nnfw::misc::str("after_lower_subg-", subg_index.value())); } // Shape inference. @@ -566,28 +119,15 @@ std::shared_ptr Compiler::compile(void) // Run the StaticShapeInfer of primary subg. 
All child StaticShapeInferers are called // recursively std::unordered_map> inferers = - createStaticShapeInferers(lowered_subgs); + StaticShapeInferer::createStaticShapeInferers(lowered_subgs); - if (model_count == 1) - { - const auto primary_subg_idx = ir::SubgraphIndex{0}; - inferers.at(primary_subg_idx)->infer(); + const auto primary_subg_idx = ir::SubgraphIndex{0}; + inferers.at(primary_subg_idx)->infer(); - for (const auto &pair : inferers) - { - const auto inferer = pair.second.get(); - inferer->dump(); - } - } - else + for (const auto &pair_inferer : inferers) { - // Assume multi model has only one subgraph on each model - for (const auto &pair : inferers) - { - const auto inferer = pair.second.get(); - inferer->infer(); - inferer->dump(); - } + const auto inferer = pair_inferer.second.get(); + inferer->dump(); } } @@ -598,7 +138,7 @@ std::shared_ptr Compiler::compile(void) // - Check parameter value validation which valid value is depend on input tensor shape // - Output tensor shape validation check is needless because // static/dynamic shape inferer will make valid output shape - for (auto &pair : lowered_subgs) + for (const auto &pair : lowered_subgs) { auto &lowered_subg = pair.second; compiler::ShapeValidator{lowered_subg->graph()}(); @@ -607,240 +147,30 @@ std::shared_ptr Compiler::compile(void) /************************************************************* * Backend independent analysis & optimization phase finished *************************************************************/ - auto executors = std::make_shared(std::move(model_edges)); - for (auto &pair : lowered_subgs) + auto executors = std::make_shared(); + for (auto &&pair : lowered_subgs) { - const auto &subg_index = pair.first; + auto const model_index = ir::ModelIndex{0}; + auto const subg_index = pair.first; auto &lowered_subg = pair.second; - auto indexed_ranks = lowered_subg->indexed_ranks(); + auto const indexed_ranks = lowered_subg->indexed_ranks(); ir::OperationDumper dumper("Executor generation of Subgraph " + std::to_string(subg_index.value())); lowered_subg->graph().operations().iterate( [&](const ir::OperationIndex &, const ir::Operation &op) { op.accept(dumper); }); - auto &options = (model_count > 1) ? 
*_voptions[subg_index.value()] : *_voptions[0]; auto executor = std::unique_ptr{ExecutorFactory::get().create( - std::move(lowered_subg), tracing_ctx.get(), options, executors)}; + std::move(lowered_subg), tracing_ctx.get(), *_options, executors, model_index)}; executor->setIndexedRanks(indexed_ranks); - executors->emplace(subg_index, std::move(executor)); + executors->emplace(model_index, subg_index, std::move(executor)); } /******************************** * Code generation phase finished ********************************/ - _state = State::COMPILED; return std::make_shared(executors, std::move(tracing_ctx)); } -std::vector> Compiler::compile(const char *package_file_path, - const char *map_file_path) -{ - // Allow one model compilation for pipeline - if (_nnpkg->model_count() != 1) - throw std::runtime_error{"Multiple models compilation for pipeline is not supported yet."}; - assert(_voptions.size() == 1); - - auto model = _nnpkg->primary_model(); - auto &options = *_voptions[0]; - - std::string package_path(package_file_path); - std::string partition_map_file; - - if (map_file_path) - { - partition_map_file = map_file_path; - } - else - { - partition_map_file = package_path + "/partition_map.json"; - } - - std::ifstream pmfs(partition_map_file); - Json::Value root; - pmfs >> root; - const Json::Value &map = root["partition_map"]; - const Json::Value &np = root["num_partitions"]; - - uint32_t num_graphs = 1; - - if (pmfs.is_open()) - { - num_graphs = np.asUInt(); - for (uint32_t i = 0; i < (uint32_t)map.size(); ++i) - { - options.partial_graph_options.index_to_graph[ir::OperationIndex{i}] = - ir::SubgraphIndex{map[i].asUInt()}; - } - } - else - { - throw std::runtime_error("There is no partition map file"); - } - - if (!buildPartialGraph(num_graphs)) - { - throw std::runtime_error("It doesn't support in case there are subgraphs"); - } - - // Set control flow backend for control flow operators - { - auto &builtin_id = backend::builtin::Config::ID; - options.manual_scheduler_options.opcode_to_backend[ir::OpCode::If] = builtin_id; - options.manual_scheduler_options.opcode_to_backend[ir::OpCode::While] = builtin_id; - options.manual_scheduler_options.opcode_to_backend[ir::OpCode::Permute] = builtin_id; - } - - // FIXME This is a workaround for bcq operations, should remove it - { - options.manual_scheduler_options.opcode_to_backend[ir::OpCode::BCQFullyConnected] = "bcq"; - options.manual_scheduler_options.opcode_to_backend[ir::OpCode::BCQGather] = "bcq"; - } - - // FIXME This is a workaround for bulk operations, should remove it - { - options.manual_scheduler_options.opcode_to_backend[ir::OpCode::Bulk] = "trix"; - } - - verboseOptions(options); - - model->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) { - // Mandatory passes - auto part = subg.partialgraphs(); - part->iterate([&](const ir::SubgraphIndex &, ir::Graph &partialgraph) { - pass::PassRunner{} - .append(std::make_unique(partialgraph)) - .append(std::make_unique(partialgraph)) - .run(); - - // Optimizations - pass::PassRunner{} - .append(std::make_unique(partialgraph)) - .run(); - }); - }); - - /*************************************************** - * Prepare compilation phase - ***************************************************/ - - // Compilable check - // TODO: Support hybrid execution - - // execution between interpreter and compiled executor (including control flow) - if (options.disable_compile) - { - std::vector> results; - auto executors = std::make_shared(); - - model->iterate([&](const ir::SubgraphIndex &index, 
ir::Graph &subg) { - executors->emplace(index, std::make_unique(subg)); - }); - results.push_back(std::make_shared(executors, nullptr)); - _state = State::COMPILED; - return results; - } - - // Mode check - if (options.he_profiling_mode) - checkProfilerConditions(); - - /*************************************************** - * Backend independent analysis & optimization phase - ***************************************************/ - auto dump_level = static_cast(options.graph_dump_level); - onert::dumper::dot::DotDumper dot_dumper_part(dump_level); - - // Lower: Assign backend - std::unordered_map> - lowered_partialgraphs; - model->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) { - auto part = subg.partialgraphs(); - part->iterate([&](const ir::SubgraphIndex &pindex, ir::Graph &partialgraph) { - dot_dumper_part.dump(partialgraph, - nnfw::misc::str("before_lower_subg_partialgraph-", pindex.value())); - - // // Lower: Assign backend - lowered_partialgraphs[pindex] = - std::make_unique(subg, partialgraph, options); - }); - }); - - for (auto &pair : lowered_partialgraphs) - { - - const auto &partialgraph_index = pair.first; - auto &lowered_partialgraph = pair.second; - dot_dumper_part.dump(*lowered_partialgraph, "after_lower_subg_partialgraph-" + - std::to_string(partialgraph_index.value())); - } - - // Partial Graph shape inference - std::unordered_map> inferers = - createStaticShapeInferers(lowered_partialgraphs); - // NOTE If partialgraph has subgraphs StaticShapeInferer may be called multiple times - for (auto &pair : lowered_partialgraphs) - { - const auto &partialgraph_index = pair.first; - const auto partial_inferer = inferers.at(partialgraph_index).get(); - partial_inferer->infer(); - partial_inferer->dump(); - } - - // Shape validation - // TODO Move shape independent feature check from ShapeValidator to OperationValidator - // TODO Move ShapeValidator into shape inference - // - Check input tensor shape validation - // - Check parameter value validation which valid value is depend on input tensor shape - // - Output tensor shape validation check is needless because - // static/dynamic shape inferer will make valid output shape - for (auto &pair : lowered_partialgraphs) - { - auto &lowered_partialgraph = pair.second; - compiler::ShapeValidator{lowered_partialgraph->graph()}(); - } - - /************************************************************* - * Backend independent analysis & optimization phase finished - *************************************************************/ - std::map> ordered; - for (auto &pair : lowered_partialgraphs) - { - // const auto &partialgraph_index = pair.first; - auto &lowered_partialgraph = pair.second; - - ordered.insert(make_pair(pair.first.value(), std::move(lowered_partialgraph))); - } - - std::vector> results; - for (auto &pair : ordered) - { - auto executors = std::make_shared(); - - const auto &partialgraph_index = ir::SubgraphIndex(pair.first); - auto &lowered_partialgraph = pair.second; - auto indexed_ranks = lowered_partialgraph->indexed_ranks(); - ir::OperationDumper dumper("Executor generation of Subgraph " + - std::to_string(partialgraph_index.value())); - lowered_partialgraph->graph().operations().iterate( - [&](const ir::OperationIndex &, const ir::Operation &op) { op.accept(dumper); }); - auto executor = std::unique_ptr{ - ExecutorFactory::get().create(std::move(lowered_partialgraph), nullptr, options, executors)}; - executor->setIndexedRanks(indexed_ranks); - executors->emplace(ir::SubgraphIndex{0}, std::move(executor)); - - // It 
doesn't support tracing in case of partial graph - results.push_back(std::make_shared(executors, nullptr)); - } - - _nnpkg.reset(); - /******************************** - * Code generation phase finished - ********************************/ - _state = State::COMPILED; - - return results; -} - } // namespace compiler - } // namespace onert diff --git a/runtime/onert/core/src/compiler/CompilerFactory.cc b/runtime/onert/core/src/compiler/CompilerFactory.cc new file mode 100644 index 000000000..d8d4bb277 --- /dev/null +++ b/runtime/onert/core/src/compiler/CompilerFactory.cc @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compiler/CompilerFactory.h" + +#include "MultiModelCompiler.h" + +#include "compiler/Compiler.h" + +namespace onert +{ +namespace compiler +{ + +CompilerFactory &CompilerFactory::get() +{ + static CompilerFactory singleton; + return singleton; +} + +std::unique_ptr +CompilerFactory::create(const std::shared_ptr &nnpkg, + std::vector> &copts) +{ + if (nnpkg->model_count() == 1) + return std::make_unique(nnpkg, copts); + + return std::make_unique(nnpkg, copts); +} + +} // namespace compiler +} // namespace onert diff --git a/runtime/onert/core/src/compiler/CompilerOptions.cc b/runtime/onert/core/src/compiler/CompilerOptions.cc new file mode 100644 index 000000000..b5fd392e0 --- /dev/null +++ b/runtime/onert/core/src/compiler/CompilerOptions.cc @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
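CompilerFactory::create() above returns the single-model Compiler when the package holds exactly one model and MultiModelCompiler otherwise, so callers only deal with the common compiler interface. A caller-side sketch under stated assumptions: the element type of the options vector, the NNPkg parameter type, and the CompilerArtifact result type are inferred from surrounding code in this patch, not spelled out here.

#include <memory>
#include <vector>

#include "compiler/CompilerFactory.h"
#include "compiler/CompilerOptions.h"
#include "ir/NNPkg.h"

// Hypothetical helper: compile a loaded NN package with per-model compiler options.
std::shared_ptr<onert::compiler::CompilerArtifact>
compilePackage(const std::shared_ptr<onert::ir::NNPkg> &nnpkg,
               std::vector<std::unique_ptr<onert::compiler::CompilerOptions>> &copts)
{
  // The factory hides the single-model vs. multi-model distinction from the caller.
  auto compiler = onert::compiler::CompilerFactory::get().create(nnpkg, copts);
  return compiler->compile(); // artifact type name assumed; it bundles executors and tracing context
}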
+ */ + +#include "compiler/CompilerOptions.h" + +#include "../backend/builtin/Backend.h" + +#include "util/ConfigSource.h" +#include "util/logging.h" + +#include + +namespace +{ + +using namespace onert; + +std::string getOpBackends(std::unordered_map &opcode_to_backend) +{ + std::unordered_map::iterator it; + std::string opbackends; + + for (it = opcode_to_backend.begin(); it != opcode_to_backend.end(); ++it) + { + if (!opbackends.empty()) + opbackends = opbackends + ", "; + + auto opcode = it->first; + const std::string opname = ir::toString(opcode); + opbackends += opname + "=" + it->second; + } + return opbackends; +} + +} // namespace + +namespace onert +{ +namespace compiler +{ + +void ManualSchedulerOptions::setBackendMap(const std::string &str) +{ + // TODO Support multiple subgraphs for manual scheduling + auto key_val_list = nnfw::misc::split(str, ';'); + for (const auto &key_val_str : key_val_list) + { + if (key_val_str.empty()) + { + continue; + } + + auto key_val = nnfw::misc::split(key_val_str, '='); + const auto &key_str = key_val.at(0); + const auto &val = key_val.at(1); + auto key = static_cast(std::stoi(key_str)); + this->index_to_backend.emplace(ir::OperationIndex{key}, val); + } +} + +std::unique_ptr CompilerOptions::fromGlobalConfig() +{ + auto o = std::make_unique(); + o->backend_list = nnfw::misc::split(util::getConfigString(util::config::BACKENDS), ';'); + o->trace_filepath = util::getConfigString(util::config::TRACE_FILEPATH); + o->graph_dump_level = util::getConfigInt(util::config::GRAPH_DOT_DUMP); + o->executor = util::getConfigString(util::config::EXECUTOR); + o->he_scheduler = util::getConfigBool(util::config::USE_SCHEDULER); + o->he_profiling_mode = util::getConfigBool(util::config::PROFILING_MODE); + o->fp16_enable = util::getConfigBool(util::config::FP16_ENABLE); + { + // Backend for all + auto &ms_options = o->manual_scheduler_options; + + // Default value for op_backend_all is first element in the backend list + ms_options.backend_for_all = util::getConfigString(util::config::OP_BACKEND_ALLOPS); + +// Opcode to Backend +#define OP(OpName) \ + { \ + const auto &backend_str = util::getConfigString(util::config::OP_BACKEND_##OpName); \ + if (!backend_str.empty()) \ + { \ + ms_options.opcode_to_backend[ir::OpCode::OpName] = backend_str; \ + } \ + } +#include "ir/Operations.lst" +#undef OP + + // Index to Backend + auto map_str = util::getConfigString(util::config::OP_BACKEND_MAP); + ms_options.setBackendMap(map_str); + } + return o; +} + +void CompilerOptions::forceInternalOptions() +{ + // Set control flow backend for control flow operators + auto &builtin_id = backend::builtin::Config::ID; + manual_scheduler_options.opcode_to_backend[ir::OpCode::If] = builtin_id; + manual_scheduler_options.opcode_to_backend[ir::OpCode::While] = builtin_id; + manual_scheduler_options.opcode_to_backend[ir::OpCode::Permute] = builtin_id; + + // FIXME This is a workaround for bcq operations, should remove it + manual_scheduler_options.opcode_to_backend[ir::OpCode::BCQFullyConnected] = "bcq"; + manual_scheduler_options.opcode_to_backend[ir::OpCode::BCQGather] = "bcq"; + + // FIXME This is a workaround for bulk operations, should remove it + manual_scheduler_options.opcode_to_backend[ir::OpCode::Bulk] = "trix"; +} + +void CompilerOptions::verboseOptions() +{ + VERBOSE(Compiler) << std::boolalpha << "==== Compiler Options ====" << std::endl; + VERBOSE(Compiler) << "backend_list : " + << nnfw::misc::join(backend_list.begin(), backend_list.end(), "/") << std::endl; + 
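Compiler::compile() earlier in this patch drives the helpers defined in this file in a fixed order: build the options from the global config, force the internal backend assignments, then dump the effective settings. A small sketch of that sequence in isolation (the wrapper function name is illustrative):

#include "compiler/CompilerOptions.h"

void prepareCompilerOptions()
{
  // Read BACKENDS, EXECUTOR, PROFILING_MODE and friends from the global config source.
  auto options = onert::compiler::CompilerOptions::fromGlobalConfig();

  // Pin builtin/bcq/trix backends for If/While/Permute, BCQ* and Bulk operations.
  options->forceInternalOptions();

  // Log the effective configuration through VERBOSE(Compiler).
  options->verboseOptions();

  // A real caller would now hand the options to a compiler instance.
}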
VERBOSE(Compiler) << "trace_filepath : " << trace_filepath << std::endl; + VERBOSE(Compiler) << "graph_dump_level : " << graph_dump_level << std::endl; + VERBOSE(Compiler) << "executor : " << executor << std::endl; + VERBOSE(Compiler) << "manual backend_for_all : " << manual_scheduler_options.backend_for_all + << std::endl; + VERBOSE(Compiler) << "manual_scheduler_options : " + << getOpBackends(manual_scheduler_options.opcode_to_backend) << std::endl; + VERBOSE(Compiler) << "he_scheduler : " << he_scheduler << std::endl; + VERBOSE(Compiler) << "he_profiling_mode : " << he_profiling_mode << std::endl; + VERBOSE(Compiler) << "fp16_enable : " << fp16_enable << std::endl + << std::noboolalpha; +} + +} // namespace compiler +} // namespace onert diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc index 024556e7e..b09d6b021 100644 --- a/runtime/onert/core/src/compiler/ExecutorFactory.cc +++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc @@ -196,7 +196,7 @@ backend::BackendContexts createBackendContexts(compiler::LoweredGraph &lgraph, b // Create contexts auto whole_op_order = lgraph.graph().topolSortOperations(); - for (auto &pair : context_data_map) + for (auto &&pair : context_data_map) { auto backend = pair.first; auto &data = pair.second; @@ -240,18 +240,22 @@ ExecutorFactory &ExecutorFactory::get() ExecutorFactory::ExecutorFactory() { _map["Linear"] = createLinearExecutor; - _map["Dataflow"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3, std::placeholders::_4, false); - _map["Parallel"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3, std::placeholders::_4, true); + _map["Dataflow"] = + std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, false); + _map["Parallel"] = + std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, true); } exec::IExecutor *ExecutorFactory::create(std::unique_ptr lowered_graph, const util::TracingCtx *tracing_ctx, const compiler::CompilerOptions &options, - const std::shared_ptr &executors) + const std::shared_ptr &executors, + const ir::ModelIndex &index) { - return _map.at(options.executor)(std::move(lowered_graph), tracing_ctx, options, executors); + return _map.at(options.executor)(std::move(lowered_graph), tracing_ctx, options, executors, + index); } void ExecutorFactory::prepareMigrantTensors(compiler::LoweredGraph &lowered_graph, @@ -282,10 +286,11 @@ void ExecutorFactory::prepareMigrantTensors(compiler::LoweredGraph &lowered_grap } void ExecutorFactory::prepareBuiltinBackend(const TensorRegistries &tensor_regs, - const std::shared_ptr &executors, - const backend::BackendContexts &backend_contexts) + const std::shared_ptr &executors, + const backend::BackendContexts &backend_contexts, + const ir::ModelIndex &index) { - for (auto &pair : backend_contexts) + for (auto &&pair : backend_contexts) { auto builtin_context = dynamic_cast(pair.second.get()); if (builtin_context != nullptr) @@ -293,6 +298,7 @@ void ExecutorFactory::prepareBuiltinBackend(const TensorRegistries &tensor_regs, auto builtin_kernel_gen = builtin_context->kernel_gen; builtin_kernel_gen->setTensorRegistries(tensor_regs); builtin_kernel_gen->setExecutors(executors); + builtin_kernel_gen->setModelIndex(index); } } } @@ 
-302,7 +308,7 @@ ExecutorFactory::orderBackendContext(const backend::BackendContexts &backend_con { std::deque> ordered_contexts; - for (auto &pair : backend_contexts) + for (auto &&pair : backend_contexts) { // NOTE builtin backend must be processed lastly. // This is because of Permute layer's specialty which is the only operation that could have @@ -319,7 +325,8 @@ ExecutorFactory::orderBackendContext(const backend::BackendContexts &backend_con exec::IExecutor *ExecutorFactory::createLinearExecutor( std::unique_ptr lowered_graph, const util::TracingCtx *tracing_ctx, - const compiler::CompilerOptions &options, const std::shared_ptr &executors) + const compiler::CompilerOptions &options, const std::shared_ptr &executors, + const ir::ModelIndex &index) { auto &graph = lowered_graph->graph(); @@ -337,7 +344,7 @@ exec::IExecutor *ExecutorFactory::createLinearExecutor( auto order = Linear::linearize(*lowered_graph); Linear::dump(*lowered_graph, order); - for (auto &pair : backend_contexts) + for (auto &&pair : backend_contexts) { pair.second->genTensors(); } @@ -345,7 +352,7 @@ exec::IExecutor *ExecutorFactory::createLinearExecutor( prepareMigrantTensors(*lowered_graph, backend_contexts); // Give some runtime objects to builtin KernelGenerator - prepareBuiltinBackend(tensor_regs, executors, backend_contexts); + prepareBuiltinBackend(tensor_regs, executors, backend_contexts, index); ExecutionBuilder builder; @@ -406,10 +413,10 @@ exec::IExecutor *ExecutorFactory::createLinearExecutor( } // Generate kernels - for (auto &pair : ordered_contexts) + for (auto &&pair : ordered_contexts) { auto codes = pair.second->genKernels(); - for (auto &pair : codes) + for (auto &&pair : codes) { auto &op_ind = pair.first; auto &fn_seq = pair.second; @@ -444,8 +451,8 @@ exec::IExecutor *ExecutorFactory::createLinearExecutor( exec::IExecutor *ExecutorFactory::createDataflowExecutor( std::unique_ptr lowered_graph, const util::TracingCtx *tracing_ctx, - const compiler::CompilerOptions &options, const std::shared_ptr &executors, - bool parallel) + const compiler::CompilerOptions &options, const std::shared_ptr &executors, + const ir::ModelIndex &index, bool parallel) { backend::BackendContexts backend_contexts = createBackendContexts(*lowered_graph, options.executor == "Linear"); @@ -457,7 +464,7 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED); - for (auto &pair : backend_contexts) + for (auto &&pair : backend_contexts) { pair.second->genTensors(); } @@ -465,7 +472,7 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( prepareMigrantTensors(*lowered_graph, backend_contexts); // Give some runtime objects to builtin KernelGenerator - prepareBuiltinBackend(tensor_regs, executors, backend_contexts); + prepareBuiltinBackend(tensor_regs, executors, backend_contexts, index); ExecutionBuilder builder; @@ -473,10 +480,10 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( auto ordered_contexts = orderBackendContext(backend_contexts); // Generate kernels - for (auto &pair : ordered_contexts) + for (auto &&pair : ordered_contexts) { auto codes = pair.second->genKernels(); - for (auto &pair : codes) + for (auto &&pair : codes) { auto &op_ind = pair.first; auto &fn_seq = pair.second; diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.h b/runtime/onert/core/src/compiler/ExecutorFactory.h index 70c089f8c..f8f989043 100644 --- 
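// Illustrative sketch, not part of the upstream patch: the ExecutorFactory changes above extend
// the registration map so that every creator also receives the model index, while "Dataflow" and
// "Parallel" keep sharing one creator whose trailing `parallel` flag is bound at registration
// time. The types below are simplified stand-ins, not the runtime's real signatures.
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

struct ModelIndex
{
  int value;
};

int createDataflowExecutor(const std::string &name, ModelIndex index, bool parallel)
{
  std::cout << "create " << name << " for model " << index.value << ", parallel=" << parallel
            << std::endl;
  return parallel ? 2 : 1;
}

int main()
{
  std::unordered_map<std::string, std::function<int(const std::string &, ModelIndex)>> map;
  map["Dataflow"] =
    std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2, false);
  map["Parallel"] =
    std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2, true);
  // An equivalent lambda registration, shown for comparison:
  // map["Parallel"] = [](const std::string &n, ModelIndex i) { return createDataflowExecutor(n, i, true); };
  std::cout << map.at("Parallel")("Parallel", ModelIndex{0}) << std::endl;
  return 0;
}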
a/runtime/onert/core/src/compiler/ExecutorFactory.h +++ b/runtime/onert/core/src/compiler/ExecutorFactory.h @@ -21,7 +21,7 @@ #include "backend/ITensor.h" #include "compiler/LoweredGraph.h" -#include "exec/Executors.h" +#include "exec/IExecutors.h" #include #include @@ -40,7 +40,8 @@ public: exec::IExecutor *create(std::unique_ptr lowered_graph, const util::TracingCtx *tracing_ctx, const compiler::CompilerOptions &options, - const std::shared_ptr &executors); + const std::shared_ptr &executors, + const ir::ModelIndex &index); private: ExecutorFactory(); @@ -49,26 +50,28 @@ private: static void prepareMigrantTensors(compiler::LoweredGraph &lowered_graph, const backend::BackendContexts &backend_contexts); static void prepareBuiltinBackend(const TensorRegistries &tensor_regs, - const std::shared_ptr &executors, - const backend::BackendContexts &backend_contexts); + const std::shared_ptr &executors, + const backend::BackendContexts &backend_contexts, + const ir::ModelIndex &index); static std::deque> orderBackendContext(const backend::BackendContexts &backend_contexts); static exec::IExecutor *createLinearExecutor( std::unique_ptr lowered_graph, const util::TracingCtx *tracing_ctx, - const compiler::CompilerOptions &options, const std::shared_ptr &executors); - static exec::IExecutor * - createDataflowExecutor(std::unique_ptr lowered_graph, - const util::TracingCtx *tracing_ctx, - const compiler::CompilerOptions &options, - const std::shared_ptr &executors, bool parallel); + const compiler::CompilerOptions &options, const std::shared_ptr &executors, + const ir::ModelIndex &index); + static exec::IExecutor *createDataflowExecutor( + std::unique_ptr lowered_graph, const util::TracingCtx *tracing_ctx, + const compiler::CompilerOptions &options, const std::shared_ptr &executors, + const ir::ModelIndex &index, bool parallel); private: std::unordered_map< std::string, std::function, const util::TracingCtx *tracing_ctx, - const compiler::CompilerOptions &options, const std::shared_ptr &executors)>> + const compiler::CompilerOptions &options, const std::shared_ptr &executors, + const ir::ModelIndex &index)>> _map; }; diff --git a/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc b/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc index 98dc906e4..fdf4e24f0 100644 --- a/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc +++ b/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc @@ -393,10 +393,10 @@ void Fp32ToFp16Converter::convertOperandsOfOpSequence(ir::OpSequence &op_seq) const auto &op_seq_inputs = _lowered_graph.graph().getInputs(); const auto &op_seq_outputs = _lowered_graph.graph().getOutputs(); - for (auto &op_idx : op_seq) + for (const auto &op_idx : op_seq) { const auto &node = operations.at(op_idx); - for (auto &ind : node.getInputs() | ir::Remove::UNDEFINED) + for (const auto &ind : node.getInputs() | ir::Remove::UNDEFINED) { if (node.opcode() == ir::OpCode::ConvertFp32ToFp16 || op_seq_inputs.contains(ind)) continue; @@ -410,7 +410,7 @@ void Fp32ToFp16Converter::convertOperandsOfOpSequence(ir::OpSequence &op_seq) VERBOSE(Fp32ToFp16Converter) << "Input Operand " << ind << ": fp16" << std::endl; } - for (auto &ind : node.getOutputs()) + for (const auto &ind : node.getOutputs()) { if (node.opcode() == ir::OpCode::ConvertFp16ToFp32 || op_seq_outputs.contains(ind)) continue; @@ -747,7 +747,7 @@ Fp32ToFp16Converter::findOpSequencesContiguous(const InputToOpSeqs &input_to_op_ // | | // [OPERATION] [OPERATION] // - for (auto &op_seq_ind : found_input_in_op_seqs->second) + for 
(const auto &op_seq_ind : found_input_in_op_seqs->second) { auto found_in_fp32_to_fp16 = _list_fp32_to_fp16.find(op_seq_ind); if (found_in_fp32_to_fp16 != _list_fp32_to_fp16.end()) @@ -799,13 +799,13 @@ Fp32ToFp16Converter::getListOpSequences(const OpSeqIndexToOpSeqIndexList &opseq_ OpSeqIndexList list; for (const auto &it : opseq_map_to_delete) { - auto &opseq_ind_fp16_to_fp32 = it.first; + const auto &opseq_ind_fp16_to_fp32 = it.first; if (list.find(opseq_ind_fp16_to_fp32) == list.end()) { list.emplace(opseq_ind_fp16_to_fp32); } - for (auto &opseq_ind_fp32_to_fp16 : it.second) + for (const auto &opseq_ind_fp32_to_fp16 : it.second) { if (list.find(opseq_ind_fp32_to_fp16) == list.end()) { @@ -869,7 +869,7 @@ void Fp32ToFp16Converter::manipulateContiguousOpSequences( auto &op_seq_fp16_to_fp32 = op_seqs.at(op_seq_ind_fp16_to_fp32); auto &input_ind_fp16_to_fp32 = op_seq_fp16_to_fp32.getInputs().at(0); - for (auto &op_seq_ind_fp32_to_fp16 : it.second) + for (const auto &op_seq_ind_fp32_to_fp16 : it.second) { auto &op_seq_fp32_to_fp16 = op_seqs.at(op_seq_ind_fp32_to_fp16); assert(op_seq_fp32_to_fp16.size() == 1); @@ -879,7 +879,7 @@ void Fp32ToFp16Converter::manipulateContiguousOpSequences( auto found_next_to_fp16 = input_to_op_seqs.find(output_ind_fp32_to_fp16); assert(found_next_to_fp16 != input_to_op_seqs.end()); - for (auto &op_seq_ind_next_to_fp16 : found_next_to_fp16->second) + for (const auto &op_seq_ind_next_to_fp16 : found_next_to_fp16->second) { manipulateInput(op_seq_ind_next_to_fp16, output_ind_fp32_to_fp16, input_ind_fp16_to_fp32); } @@ -901,7 +901,7 @@ void Fp32ToFp16Converter::deleteContiguousOpSequences( auto &operations = _lowered_graph.graph().operations(); auto &op_seqs = _lowered_graph.op_seqs(); - for (auto &op_seq_ind : list_to_delete_op_seqs) + for (const auto &op_seq_ind : list_to_delete_op_seqs) { auto &op_seq = op_seqs.at(op_seq_ind); assert(op_seq.size() == 1); @@ -914,7 +914,7 @@ void Fp32ToFp16Converter::deleteContiguousOpSequences( VERBOSE(Fp32ToFp16Converter) << "Delete Node " << first_node_ind << std::endl; // Uses - for (auto &ind : first_node.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED) + for (const auto &ind : first_node.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED) { auto &obj = operands.at(ind); obj.removeUse(first_node_ind); @@ -923,7 +923,7 @@ void Fp32ToFp16Converter::deleteContiguousOpSequences( } // Def - for (auto &ind : first_node.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED) + for (const auto &ind : first_node.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED) { auto &obj = operands.at(ind); assert(obj.getDef() == first_node_ind); @@ -942,7 +942,7 @@ void Fp32ToFp16Converter::deleteContiguousOpSequences( } // Operand - for (auto &ind : list_to_delete_ops) + for (const auto &ind : list_to_delete_ops) { operands.remove(ind); VERBOSE(Fp32ToFp16Converter) << "Operand " << ind << " is removed" << std::endl; diff --git a/runtime/onert/core/src/compiler/HEScheduler.cc b/runtime/onert/core/src/compiler/HEScheduler.cc index c4bfddb8f..65fd4cd77 100644 --- a/runtime/onert/core/src/compiler/HEScheduler.cc +++ b/runtime/onert/core/src/compiler/HEScheduler.cc @@ -512,7 +512,7 @@ HEScheduler::ESTAndExecTime(const backend::Backend *backend, const ir::Operation // Find free time for data transferring and insert it into backend taskset. This is needed: // 1. Time for multiple permutations for this node's input is found correctly // 2. 
If backend==cpu, then free time for this node must come after permutations - for (auto &it : transfer_st_exec_time) + for (auto &&it : transfer_st_exec_time) { if (_is_parallel_exec) { diff --git a/runtime/onert/core/src/compiler/HEScheduler.test.cc b/runtime/onert/core/src/compiler/HEScheduler.test.cc index c4a2df025..589331b49 100644 --- a/runtime/onert/core/src/compiler/HEScheduler.test.cc +++ b/runtime/onert/core/src/compiler/HEScheduler.test.cc @@ -163,7 +163,7 @@ void setOperationsExecutionTime(const std::vector &backends, ExecTime et(backends); for (int i = 0; i < op_names.size(); ++i) { - for (auto &backend : backends) + for (const auto backend : backends) setOperationExecTime(et, backend, op_names[i], false, op_sizes[i], exec_time); } et.storeOperationsExecTime(); @@ -189,7 +189,7 @@ void setPermutationsExecutionTime(const std::vector &backends, ExecTime et(backends); for (const auto &backend : backends) { - for (auto &other_backend : backends) + for (const auto other_backend : backends) { if (backend == other_backend) continue; diff --git a/runtime/onert/core/src/compiler/LoweredGraph.cc b/runtime/onert/core/src/compiler/LoweredGraph.cc index 9e84753a7..d53d0ed00 100644 --- a/runtime/onert/core/src/compiler/LoweredGraph.cc +++ b/runtime/onert/core/src/compiler/LoweredGraph.cc @@ -44,14 +44,6 @@ LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &option lowerGraph(options); } -// TODO Design better class and constructor to represent parent_graph -LoweredGraph::LoweredGraph(const ir::Graph &parent_graph, const ir::Graph &graph, - const CompilerOptions &options) - : _graph{graph}, _parent_graph{parent_graph} -{ - lowerGraph(options); -} - void LoweredGraph::lowerGraph(const CompilerOptions &options) { // Build backend contexts diff --git a/runtime/onert/core/src/compiler/ManualScheduler.cc b/runtime/onert/core/src/compiler/ManualScheduler.cc index af2d84cd9..621f0c7b7 100644 --- a/runtime/onert/core/src/compiler/ManualScheduler.cc +++ b/runtime/onert/core/src/compiler/ManualScheduler.cc @@ -64,7 +64,7 @@ std::unique_ptr ManualScheduler::schedule(const ir::Graph &grap // 2. Backend per operation type std::unordered_map op_type_map; - for (auto &pair : manual_options.opcode_to_backend) + for (const auto &pair : manual_options.opcode_to_backend) { op_type_map.emplace(pair.first, BackendManager::get().get(pair.second)); } @@ -80,7 +80,7 @@ std::unique_ptr ManualScheduler::schedule(const ir::Graph &grap }); // 3. Backend per operation - for (auto &pair : manual_options.index_to_backend) + for (const auto &pair : manual_options.index_to_backend) { const auto &key = pair.first; const auto &val = pair.second; diff --git a/runtime/onert/core/src/compiler/MultiModelCompiler.cc b/runtime/onert/core/src/compiler/MultiModelCompiler.cc new file mode 100644 index 000000000..fea6a7f25 --- /dev/null +++ b/runtime/onert/core/src/compiler/MultiModelCompiler.cc @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "MultiModelCompiler.h" + +#include "ExecutorFactory.h" +#include "ShapeValidator.h" +#include "pass/ConstantOutputPass.h" +#include "pass/OddOutputPass.h" +#include "pass/PassRunner.h" +#include "pass/UnusedOperandEliminationPass.h" +#include "../dumper/dot/DotDumper.h" +#include "../exec/Executors.h" +#include "../ir/OperationDumper.h" +#include "../ir/verifier/Verifier.h" + +#include "compiler/StaticShapeInferer.h" + +#include + +namespace onert +{ +namespace compiler +{ + +MultiModelCompiler::MultiModelCompiler(const std::shared_ptr &nnpkg, + std::vector> &copts) + : _nnpkg{nnpkg}, _voptions{} +{ + assert(nnpkg->model_count() != 1); + + for (uint32_t i = 0; i < copts.size(); i++) + { + _voptions.push_back(copts[i].get()); + } +} + +std::shared_ptr MultiModelCompiler::compile(void) +{ + /*************************************************** + * Prepare compilation phase + ***************************************************/ + for (auto options : _voptions) + { + if (!options) + throw std::runtime_error{"Empty compile option"}; + + // Mode check + // TODO handle option for each model + if (options->he_profiling_mode) + throw std::runtime_error("NYI: Profiling mode for multiple model is not supported yet"); + + options->forceInternalOptions(); + options->verboseOptions(); + } + + // NYI: allow one model compilation + auto const model_count = _nnpkg->model_count(); + if (model_count != _voptions.size()) + throw std::runtime_error{"Model count and option vector size mismatch"}; + + for (uint16_t i = 0; i < model_count; i++) + { + _nnpkg->model(ir::ModelIndex{i})->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) { + // Mandatory passes + pass::PassRunner{} + .append(std::make_unique(subg)) + .append(std::make_unique(subg)) + .run(); + + // Optimizations + pass::PassRunner{}.append(std::make_unique(subg)).run(); + }); + } + + /*************************************************** + * Backend independent analysis & optimization phase + ***************************************************/ + // TODO Handle dump level for each model + auto dump_level = static_cast(_voptions[0]->graph_dump_level); + onert::dumper::dot::DotDumper dot_dumper(dump_level); + + // Tracing context + // TODO Support tracing_ctx for multiple model + std::unique_ptr tracing_ctx = nullptr; + + // Model edge context: copy model edge context + auto model_edges = std::make_unique(_nnpkg->model_edges()); + + // Lower: Assign backend + std::unordered_map>> + lowered_subgs; + + for (uint16_t i = 0; i < model_count; i++) + { + auto const model_index = ir::ModelIndex{i}; + auto model = _nnpkg->model(model_index); + + model->iterate([&](const ir::SubgraphIndex &subg_index, ir::Graph &subg) { + dot_dumper.dump(subg, + nnfw::misc::str("before_lower_model-", i, "-subg-", subg_index.value())); + // Lower: Assign backend + lowered_subgs[model_index][subg_index] = + std::make_unique(subg, *_voptions[i]); + // Set tracing_ctx for copied graph + if (tracing_ctx != nullptr) + tracing_ctx->setSubgraphIndex(&(lowered_subgs[model_index][subg_index]->graph()), + subg_index.value()); + }); + } + + _nnpkg.reset(); + + for (const auto &pair : lowered_subgs) + { + const auto &model_index = pair.first; + const auto &model_lsubg = pair.second; + + for (const auto &pair_inner : model_lsubg) + { + const auto &subg_index = pair_inner.first; + const auto &lowered_subg = pair_inner.second; + dot_dumper.dump(*lowered_subg, 
nnfw::misc::str("after_lower_model-", model_index.value(), + "-subg-", subg_index.value())); + } + } + + // Shape inference. + for (auto &&pair : lowered_subgs) + { + auto &model_lsubgs = pair.second; + // Run the StaticShapeInfer of primary subg. All child StaticShapeInferers are called + // recursively + std::unordered_map> inferers = + StaticShapeInferer::createStaticShapeInferers(model_lsubgs); + + const auto primary_subg_idx = ir::SubgraphIndex{0}; + inferers.at(primary_subg_idx)->infer(); + + for (const auto &pair_inferer : inferers) + { + const auto inferer = pair_inferer.second.get(); + inferer->dump(); + } + } + + // Shape validation + // TODO Move shape independent feature check from ShapeValidator to OperationValidator + // TODO Move ShapeValidator into shape inference + // - Check input tensor shape validation + // - Check parameter value validation which valid value is depend on input tensor shape + // - Output tensor shape validation check is needless because + // static/dynamic shape inferer will make valid output shape + for (const auto &pair : lowered_subgs) + { + const auto &model_lsubgs = pair.second; + + for (const auto &pair_inner : model_lsubgs) + { + const auto &lowered_subg = pair_inner.second; + compiler::ShapeValidator{lowered_subg->graph()}(); + } + } + + /************************************************************* + * Backend independent analysis & optimization phase finished + *************************************************************/ + auto executors = std::make_shared(std::move(model_edges)); + for (auto &&pair : lowered_subgs) + { + auto const &model_index = pair.first; + auto &model_lsubgs = pair.second; + + for (auto &&pair_inner : model_lsubgs) + { + auto const subg_index = pair_inner.first; + auto &lowered_subg = pair_inner.second; + auto const indexed_ranks = lowered_subg->indexed_ranks(); + + ir::OperationDumper dumper("Executor generation of Subgraph " + + std::to_string(subg_index.value())); + lowered_subg->graph().operations().iterate( + [&](const ir::OperationIndex &, const ir::Operation &op) { op.accept(dumper); }); + + auto &options = *_voptions[model_index.value()]; + auto executor = std::unique_ptr{ExecutorFactory::get().create( + std::move(lowered_subg), tracing_ctx.get(), options, executors, model_index)}; + executor->setIndexedRanks(indexed_ranks); + executors->emplace(model_index, subg_index, std::move(executor)); + } + } + + /******************************** + * Code generation phase finished + ********************************/ + return std::make_shared(executors, std::move(tracing_ctx)); +} + +} // namespace compiler +} // namespace onert diff --git a/runtime/onert/core/src/compiler/MultiModelCompiler.h b/runtime/onert/core/src/compiler/MultiModelCompiler.h new file mode 100644 index 000000000..89af664f8 --- /dev/null +++ b/runtime/onert/core/src/compiler/MultiModelCompiler.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file MultiModelCompiler.h + * @brief This file contains MultiModelCompiler class to define and run compilation phase + */ + +#ifndef __ONERT_COMPILER_MULTI_MODEL_COMPILER_H__ +#define __ONERT_COMPILER_MULTI_MODEL_COMPILER_H__ + +#include "compiler/CompilerOptions.h" +#include "compiler/ICompiler.h" +#include "ir/NNPkg.h" + +namespace onert +{ +namespace compiler +{ + +/** + * @brief Class to compile NN package + */ +class MultiModelCompiler final : public ICompiler +{ +public: + /** + * @brief Construct a new Compiler object for NN package + * @param[in] nnpkg NN package to compile + * @param[in] coptions Compiler option vector for each model in package + */ + MultiModelCompiler(const std::shared_ptr &nnpkg, + std::vector> &copts); + + /** + * @brief Destroy the MultiModelCompiler object + */ + ~MultiModelCompiler() = default; + +public: + /** + * @brief Do compilation with the options + * + * @return std::shared_ptr Executors as a result of compilation + */ + std::shared_ptr compile(void); + +private: + std::shared_ptr &primary_subgraph() + { + return _nnpkg->primary_model()->at(ir::SubgraphIndex{0}); + } + +private: + std::shared_ptr _nnpkg; + std::vector _voptions; +}; + +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_MULTI_MODEL_COMPILER_H__ diff --git a/runtime/onert/core/src/compiler/StaticShapeInferer.cc b/runtime/onert/core/src/compiler/StaticShapeInferer.cc index 485450560..25747d950 100644 --- a/runtime/onert/core/src/compiler/StaticShapeInferer.cc +++ b/runtime/onert/core/src/compiler/StaticShapeInferer.cc @@ -18,6 +18,8 @@ #include "util/ShapeInference.h" #include "util/logging.h" +#include + #include #include @@ -188,6 +190,95 @@ void StaticShapeInferer::dump() }); } +std::unordered_map> +StaticShapeInferer::createStaticShapeInferers( + const std::unordered_map> &lowered_subgs) +{ + // Allocate StaticShapeInferer per each subgraph + std::unordered_map> inferers; + for (auto &&pair : lowered_subgs) + { + const auto &subg_index = pair.first; + auto &lowered_subg = pair.second; + inferers[subg_index] = std::make_unique(lowered_subg.get()); + } + + // Append observers in all StaticShapeInferers + for (auto &&pair : lowered_subgs) + { + const auto &subg_index = pair.first; + auto &lowered_subg = pair.second; + + // TODO: Change this iteration for all to controlflow iteration + lowered_subg->graph().operations().iterate( + [&](const ir::OperationIndex &, const ir::Operation &op) { + // A Function to append child inferers. These make it possible for a StaticShapeInferer to + // call StaticShapeInferes of child subgraphs recursively + auto appendChildInferer = [&](const ir::SubgraphIndex &child_subg_idx) { + auto *child_inferer = inferers.at(child_subg_idx).get(); + inferers.at(subg_index)->appendChildInferer(child_subg_idx, child_inferer); + }; + + // A Function to appaend subg input observers. This makes it possible for a + // StaticShapeInferer to update inputs of child subgraphs + auto appendSubgraphInputObserver = [&](const ir::SubgraphIndex &child_subg_idx) { + std::vector child_subg_inputs; + auto &child_subg = lowered_subgs.at(child_subg_idx)->graph(); + for (const auto &input_idx : child_subg.getInputs()) + { + auto operand_ptr = child_subg.operands().getRawPtr(input_idx); + child_subg_inputs.emplace_back(operand_ptr); + } + inferers.at(subg_index) + ->appendSubgInputObserver(child_subg_idx, + std::make_unique(child_subg_inputs)); + }; + + // A Function to set controlflow output observers. 
This makes it possible for a + // StaticShapeInferer to update outputs of parent controlflow opeerations + auto setControlFlowOutputObserver = [&](const ir::SubgraphIndex &child_subg_idx) { + std::vector cf_outputs; + auto &subg = lowered_subg->graph(); + for (const auto &output_idx : op.getOutputs()) + { + auto operand_ptr = subg.operands().getRawPtr(output_idx); + cf_outputs.emplace_back(operand_ptr); + } + inferers.at(child_subg_idx) + ->setControlflowOutputObserver(std::make_unique(cf_outputs)); + }; + + // Append Observers in a StaticShapeInferer + if (op.opcode() == ir::OpCode::If) + { + const auto &if_op = nnfw::misc::polymorphic_downcast(op); + + appendChildInferer(if_op.param().then_subg_index); + appendChildInferer(if_op.param().else_subg_index); + + appendSubgraphInputObserver(if_op.param().then_subg_index); + appendSubgraphInputObserver(if_op.param().else_subg_index); + + setControlFlowOutputObserver(if_op.param().then_subg_index); + } + else if (op.opcode() == ir::OpCode::While) + { + const auto &while_op = nnfw::misc::polymorphic_downcast(op); + + appendChildInferer(while_op.param().cond_subg_index); + appendChildInferer(while_op.param().body_subg_index); + + appendSubgraphInputObserver(while_op.param().cond_subg_index); + appendSubgraphInputObserver(while_op.param().body_subg_index); + + setControlFlowOutputObserver(while_op.param().body_subg_index); + } + }); + } + + return inferers; +} + void StaticShapeInferer::visit(const ir::operation::ArgMinMax &op) { auto &operands = _lowered_subg->graph().operands(); @@ -1306,8 +1397,11 @@ void StaticShapeInferer::visit(const ir::operation::Bulk &op) auto origin_output_shape = op.param().origin_output_shapes[0]; // TODO: more check for valid batch request - assert(cur_input_shape.dim(0) >= origin_output_shape.dim(0)); - assert(cur_input_shape.dim(0) % origin_output_shape.dim(0) == 0); + if ((cur_input_shape.dim(0) < origin_output_shape.dim(0)) || + (cur_input_shape.dim(0) % origin_output_shape.dim(0) != 0)) + { + throw std::runtime_error("StaticShapeInferer " + op.name() + ": Not supported batch size"); + } size_t batch_multiplier = cur_input_shape.dim(0) / origin_output_shape.dim(0); ir::Shape new_shape; diff --git a/runtime/onert/core/src/compiler/TensorRegistries.h b/runtime/onert/core/src/compiler/TensorRegistries.h index b3cc0bbe3..c7e06e84c 100644 --- a/runtime/onert/core/src/compiler/TensorRegistries.h +++ b/runtime/onert/core/src/compiler/TensorRegistries.h @@ -71,7 +71,7 @@ public: backend::ITensor *getITensor(ir::OperandIndex ind) const { - for (auto &tensor_reg : _tensor_regs) + for (auto &&tensor_reg : _tensor_regs) { auto tensor = tensor_reg->getITensor(ind); if (tensor) diff --git a/runtime/onert/core/src/compiler/pass/OddOutputPass.cc b/runtime/onert/core/src/compiler/pass/OddOutputPass.cc index f50fae0d3..e2b3f6111 100644 --- a/runtime/onert/core/src/compiler/pass/OddOutputPass.cc +++ b/runtime/onert/core/src/compiler/pass/OddOutputPass.cc @@ -34,7 +34,7 @@ void OddOutputPass::run() VERBOSE(OddOutputPass) << "Case 1 : An operand which is a model output and a model input" << std::endl; - for (auto &ind : outputs) + for (const auto &ind : outputs) { if (_graph.getInputs().contains(ind)) { @@ -46,7 +46,7 @@ void OddOutputPass::run() VERBOSE(OddOutputPass) << "Case 2 : Two or more duplicated outputs" << std::endl; std::unordered_set occurence; - for (auto &ind : outputs) + for (auto &&ind : outputs) { auto &obj = _graph.operands().at(ind); if (occurence.count(ind) == 0) diff --git 
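// Illustrative sketch, not part of the upstream patch: createStaticShapeInferers() above builds
// one StaticShapeInferer per subgraph and registers the inferers of If/While callee subgraphs as
// children of the caller's inferer, so calling infer() on the primary subgraph propagates shapes
// across control-flow boundaries. The class below is a simplified stand-in, not the real API.
#include <iostream>
#include <map>

struct ShapeInfererSketch
{
  int subg_index;
  std::map<int, ShapeInfererSketch *> children; // keyed by callee SubgraphIndex

  void appendChild(int child_idx, ShapeInfererSketch *child) { children[child_idx] = child; }

  void infer()
  {
    std::cout << "infer shapes of subgraph " << subg_index << std::endl;
    // The real inferer re-infers a child only when it visits the corresponding If/While operation;
    // this sketch simply recurses over all registered children.
    for (auto &&pair : children)
      pair.second->infer();
  }
};

int main()
{
  ShapeInfererSketch primary{0, {}}, then_subg{1, {}}, else_subg{2, {}};
  primary.appendChild(1, &then_subg); // If: then_subg_index
  primary.appendChild(2, &else_subg); // If: else_subg_index
  primary.infer();                    // only the primary inferer is invoked directly
  return 0;
}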
a/runtime/onert/core/src/compiler/pass/PassRunner.cc b/runtime/onert/core/src/compiler/pass/PassRunner.cc index 1be6d7794..2d11be201 100644 --- a/runtime/onert/core/src/compiler/pass/PassRunner.cc +++ b/runtime/onert/core/src/compiler/pass/PassRunner.cc @@ -31,7 +31,7 @@ PassRunner &PassRunner::append(std::unique_ptr pass) void PassRunner::run() { - for (auto &pass : _passes) + for (auto &&pass : _passes) { VERBOSE(PassRunner) << "Start running '" << pass->id() << "'" << std::endl; pass->run(); diff --git a/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc b/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc index 71efa1bb5..0da1e54df 100644 --- a/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc +++ b/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc @@ -105,9 +105,9 @@ void PermutationInsertionPass::callback(const ir::OperandIndex &index, ir::Opera } } - for (auto &operation : remove_list) + for (const auto &operation_index : remove_list) { - object.removeUse(operation); + object.removeUse(operation_index); } } } diff --git a/runtime/onert/core/src/exec/Execution.cc b/runtime/onert/core/src/exec/Execution.cc index 9d1e06d6c..7d5b406ef 100644 --- a/runtime/onert/core/src/exec/Execution.cc +++ b/runtime/onert/core/src/exec/Execution.cc @@ -23,13 +23,12 @@ namespace onert namespace exec { -Execution::Execution(const std::shared_ptr &executors) : _executors{executors} +Execution::Execution(const std::shared_ptr &executors) : _executors{executors} { assert(executors != nullptr); - assert(executors->at(ir::SubgraphIndex{0}) != nullptr); + assert(executors->entryExecutor() != nullptr); _io_desc.inputs.resize(_executors->inputSize()); _io_desc.outputs.resize(_executors->outputSize()); - sem_init(&_async_io_descs_sem, 0, 1); } void Execution::changeInputShape(const ir::IOIndex &index, const ir::Shape &new_shape) @@ -70,80 +69,6 @@ void Execution::setInput(const ir::IOIndex &index, const void *buffer, size_t le _io_desc.inputs.at(index.value()) = std::make_unique(info, buffer, length, layout); } -void Execution::createNewAsyncDesc(uint32_t count) -{ - IODescription *_async_io_desc = new IODescription; - _async_io_desc->inputs.resize(primary_subgraph().getInputs().size()); - _async_io_desc->outputs.resize(primary_subgraph().getOutputs().size()); - - _async_io_descs.push_back({_async_io_desc, count}); -} - -void Execution::setFinish() { finished = true; } - -bool Execution::isEmptyQueue() -{ - asyncIoDescSemWait(); - bool ret = _async_io_descs.empty(); - if (!ret) - { - for (uint32_t idx = 0; idx < _async_io_descs.front().first->inputs.size(); idx++) - { - if (_async_io_descs.front().first->inputs.at(idx).get() == nullptr) - { - ret = true; - break; - } - } - } - asyncIoDescSemPost(); - return ret; -} - -void Execution::executeAsyncInput(const ir::IOIndex &index, const void *buffer, size_t length, - ir::Layout layout) -{ - const auto info = _executors->inputInfo(index); - IODescription *_async_io_desc = _async_io_descs.back().first; - - { - auto input_shape_sig = _async_io_desc->dynamic_input_shapes.find(index); - auto size_required = - (input_shape_sig != _async_io_desc->dynamic_input_shapes.end()) - ? 
input_shape_sig->second.num_elements() * onert::ir::sizeOfDataType(info.typeInfo().type()) - : info.total_size(); - - if (length < size_required) - { - throw std::runtime_error{"Too small length"}; - } - } - void *_buffer = (void *)malloc(length); - if (_buffer == NULL) - { - throw std::runtime_error{"malloc failed"}; - } - memcpy(_buffer, buffer, length); - - _async_io_desc->inputs.at(index.value()) = - std::make_unique(info, _buffer, length, layout); -} - -void Execution::executeAsyncOutput(const ir::IOIndex &index, void *buffer, size_t length, - ir::Layout layout) -{ - const auto info = _executors->outputInfo(index); - IODescription *_async_io_desc = _async_io_descs.front().first; - - if (length < info.total_size()) - { - throw std::runtime_error{"Too small length"}; - } - - _async_io_desc->outputs.at(index.value()) = - std::make_unique(info, buffer, length, layout); -} - // TODO Remove default parameter void Execution::setInput(const ir::IOIndex &index, const ir::TypeInfo &type, const ir::Shape &shape, const void *buffer, size_t length, ir::Layout layout) @@ -209,18 +134,6 @@ void Execution::execute() VERBOSE(Execution) << "Execution finished" << std::endl; } -void Execution::AsyncExecute() -{ - VERBOSE(Execution) << "Start Async execution" << std::endl; - if (_async_io_descs.empty()) - { - VERBOSE(Execution) << "The input is not ready" << std::endl; - return; - } - - primary_executor()->execute(*_async_io_descs.front().first); -} - void Execution::startExecute() { VERBOSE(Execution) << "Create asynchronous execution thread" << std::endl; @@ -251,163 +164,21 @@ ir::Shape Execution::getInputShape(ir::IOIndex ind) const } } +// NNAPI return fail if ANeuralNetworksExecution_getOutputOperandRank or +// ANeuralNetworksExecution_getOutputOperandDimensions is called before execution. +// On the other hand, NNFW API return static shape inference result if nnfw_output_tensorinfo is +// called before execution. +// To handle both case, this method retun static shape inference result and fail will be handled on +// NNAPI frontend. 
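// Illustrative usage sketch of the behaviour described above (not part of the upstream patch; the
// definition that follows is authoritative):
//
//   onert::exec::Execution execution{executors};
//   auto s0 = execution.getOutputShape(onert::ir::IOIndex{0}); // before execute(): static shape
//                                                              // inference result via outputInfo()
//   execution.execute();
//   auto s1 = execution.getOutputShape(onert::ir::IOIndex{0}); // after execute(): shape recorded
//                                                              // in the output IODescription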
ir::Shape Execution::getOutputShape(ir::IOIndex ind) const { if (!isFinished()) - throw std::runtime_error("Cannot get output shape before execution is finished"); + return _executors->outputInfo(ind).shape(); const auto &output_desc = _io_desc.outputs.at(ind.value()); return output_desc->info.shape(); } -void Execution::asyncIoDescSemWait() { sem_wait(&_async_io_descs_sem); } - -void Execution::asyncIoDescSemPost() { sem_post(&_async_io_descs_sem); } - -void Execution::runInference() -{ - uint32_t inference_cnt; - uint32_t output_sz = primary_subgraph().getOutputs().size(); - while (true) - { - if (isEmptyQueue()) - { - if (isFinished()) - { - if (!next_exes.empty()) - { - for (uint32_t i = 0; i < next_exes.size(); i++) - { - std::get<0>(next_exes[i])->setFinish(); - } - } - else - { - sholudStop(); - } - break; - } - } - else - { - for (uint32_t i = 0; i < output_sz; i++) - { - auto opidx = primary_subgraph().getOutputs().at(i); - auto shape = primary_subgraph().operands().at(opidx).shape(); - auto dtype = primary_subgraph().operands().at(opidx).typeInfo().type(); - auto rank = shape.rank(); - uint32_t tensor_size = 1; - for (int32_t j = 0; j < rank; j++) - { - tensor_size *= shape.dim(j); - } - if (dtype == onert::ir::DataType::FLOAT32 || dtype == onert::ir::DataType::INT32 || - dtype == onert::ir::DataType::UINT32) - tensor_size *= 4; - else if (dtype == onert::ir::DataType::INT64) - tensor_size *= 8; - void *_buffer = (void *)malloc(tensor_size); - if (_buffer == NULL) - { - throw std::runtime_error{"malloc failed"}; - } - executeAsyncOutput(onert::ir::IOIndex(i), _buffer, tensor_size); - } - AsyncExecute(); - - // set inputs of next execution - auto _io_desc = getAsyncIoDescs()->front().first; - inference_cnt = getAsyncIoDescs()->front().second; - getAsyncIoDescs()->pop_front(); - - for (uint32_t i = 0; i < next_exes.size(); i++) - { - auto next_exe = std::get<0>(next_exes[i]); - auto o_index = std::get<1>(next_exes[i]); - auto i_index = std::get<2>(next_exes[i]); - - next_exe->asyncIoDescSemWait(); - auto next_io_descs = next_exe->getAsyncIoDescs(); - bool exist = false; - for (auto iter = next_io_descs->begin(); iter != next_io_descs->end(); iter++) - { - if (inference_cnt == iter->second) - { - exist = true; - } - } - - if (!exist) - { - next_exe->createNewAsyncDesc(inference_cnt); - } - for (auto iter = next_io_descs->begin(); iter != next_io_descs->end(); iter++) - { - if (inference_cnt == iter->second) - { - const auto input_index = next_exe->primary_subgraph().getInputs().at(i_index.value()); - const auto info = next_exe->primary_subgraph().operands().at(input_index).info(); - - size_t length = _io_desc->outputs[o_index.value()]->size; - void *_buffer = (void *)malloc(length); - if (_buffer == NULL) - { - throw std::runtime_error{"malloc failed"}; - } - memcpy(_buffer, _io_desc->outputs[o_index.value()]->buffer, length); - - iter->first->inputs.at(i_index.value()) = std::make_unique( - info, _buffer, length, onert::ir::Layout::NHWC); - break; - } - } - next_exe->asyncIoDescSemPost(); - } - - if (next_exes.empty()) - { - std::vector results; - for (uint32_t i = 0; i < _io_desc->outputs.size(); i++) - { - size_t length = _io_desc->outputs[i]->size; - void *_buffer = (void *)malloc(length); - if (_buffer == NULL) - { - throw std::runtime_error{"malloc failed"}; - } - memcpy(_buffer, _io_desc->outputs[i]->buffer, length); - results.push_back(_buffer); - } - _async_results.push_back(results); - } - - for (uint32_t i = 0; i < _io_desc->inputs.size(); i++) - { - auto p = 
_io_desc->inputs.at(i).release(); - if (p) - { - free((void *)p->buffer); - delete p; - } - } - for (uint32_t i = 0; i < _io_desc->outputs.size(); i++) - { - auto p = _io_desc->outputs.at(i).release(); - if (p) - { - free(p->buffer); - delete p; - } - } - delete _io_desc; - } - } -} - -bool Execution::stopWait(void) const { return stop_wait; } - -void Execution::sholudStop() { stop_wait = true; } - } // namespace exec } // namespace onert diff --git a/runtime/onert/core/src/exec/Execution.test.cc b/runtime/onert/core/src/exec/Execution.test.cc index e3ea49470..fefe8a332 100644 --- a/runtime/onert/core/src/exec/Execution.test.cc +++ b/runtime/onert/core/src/exec/Execution.test.cc @@ -17,6 +17,7 @@ #include "exec/Execution.h" #include "compiler/Compiler.h" +#include "compiler/CompilerFactory.h" #include "ir/Graph.h" #include "ir/operation/BinaryArithmetic.h" #include "util/TracingCtx.h" @@ -90,6 +91,161 @@ public: std::shared_ptr artifact; }; +class CompiledMockUpMultiModel +{ +public: + CompiledMockUpMultiModel() + { + // Model0: a float elementwise add operation + // Model0 input: lhs0, rhs0 + // Model0 output: add result (result0) + + // Model1: a qasymm8 elementwise add operation + // Model1 input: result0, rhs1 + // Model1 output: add result (result1) + + // Model2: a float elementwise add operation + // Model2 input: result0, result1 + // Model2 output: add result (result2) + + // constant: rhs2 + // result0 <= (lhs0 + rhs0) + // result1 <= (result0 + rhs1) + // result2 <= (result0 + result1) + // lhs0, rhs0, rh1, result0, result1, result2 shape: {1, 2, 2, 1} + // activation: none (constant) + + // Update edge information + edges.pkg_inputs.emplace_back(ModelIndex{0}, SubgraphIndex{0}, IOIndex{0}); + edges.pkg_inputs.emplace_back(ModelIndex{0}, SubgraphIndex{0}, IOIndex{1}); + edges.pkg_outputs.emplace_back(ModelIndex{2}, SubgraphIndex{0}, IOIndex{0}); + // From + const auto result0 = IODesc{ModelIndex{0}, SubgraphIndex{0}, IOIndex{0}}; + const auto result1 = IODesc{ModelIndex{1}, SubgraphIndex{0}, IOIndex{0}}; + // To + const auto lhs1 = IODesc{ModelIndex{1}, SubgraphIndex{0}, IOIndex{0}}; + const auto lhs2 = IODesc{ModelIndex{2}, SubgraphIndex{0}, IOIndex{0}}; + const auto rhs2 = IODesc{ModelIndex{2}, SubgraphIndex{0}, IOIndex{1}}; + edges.edges.insert({result0, lhs1}); + edges.edges.insert({result0, lhs2}); + edges.edges.insert({result1, rhs2}); + + for (size_t i = 0; i < 3; ++i) + { + graphs.emplace_back(std::make_shared()); + } + Shape shape{1, 2, 2, 1}; + + // Model0's add operands (result1 <= lhs0 + rhs0) + DataType types[3] = {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM, DataType::FLOAT32}; + auto operand_lhs0 = graphs[0]->addOperand(shape, TypeInfo{types[0]}); + auto operand_rhs0 = graphs[0]->addOperand(shape, TypeInfo{types[0]}); + auto operand_result0 = graphs[0]->addOperand(shape, TypeInfo{types[0]}); + + // Model0's add operation + operation::BinaryArithmetic::Param param0; + param0.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; + param0.activation = Activation::NONE; + auto input_set0 = OperandIndexSequence{operand_lhs0, operand_rhs0}; + auto output_set0 = OperandIndexSequence{operand_result0}; + graphs[0]->addOperation( + std::make_unique(input_set0, output_set0, param0)); + + // Model0's inputs/outputs + graphs[0]->addInput(operand_lhs0); + graphs[0]->addInput(operand_rhs0); + graphs[0]->addOutput(operand_result0); + graphs[0]->verify(); + + // Model1's add operands (result2 <= Model0 result + rhs1) + // static float rhs1_data[4] = {3, 1, -1, 5}; 
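// Note (added for clarity, not part of the upstream patch): the uint8 values below encode the
// float constants in the comment above using the asymmetric quantization mapping
// real = scale * (quantized - zero_point), with scale = 1 and zero_point = 128 declared just
// below: 131 -> 3, 129 -> 1, 127 -> -1, 133 -> 5.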
+ static uint8_t rhs1_data[4] = {131, 129, 127, 133}; + const float scale = 1; + const int32_t zero_point = 128; + auto operand_lhs1 = graphs[1]->addOperand(shape, TypeInfo{types[1], scale, zero_point}); + auto operand_rhs1 = graphs[1]->addOperand(shape, TypeInfo{types[1], scale, zero_point}); + auto operand_result1 = graphs[1]->addOperand(shape, TypeInfo{types[1], scale, zero_point}); + graphs[1] + ->operands() + .at(operand_rhs1) + .data(std::make_unique(reinterpret_cast(&rhs1_data), 4)); + + // Model1's add operation + operation::BinaryArithmetic::Param param1; + param1.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; + param1.activation = Activation::NONE; + auto input_set1 = OperandIndexSequence{operand_lhs1, operand_rhs1}; + auto output_set1 = OperandIndexSequence{operand_result1}; + graphs[1]->addOperation( + std::make_unique(input_set1, output_set1, param1)); + + // Model1's inputs/outputs + graphs[1]->addInput(operand_lhs1); + graphs[1]->addOutput(operand_result1); + graphs[1]->verify(); + + // Model2's additional operands (result3 <= Model0 result + Model1 result) + auto operand_lhs2 = graphs[2]->addOperand(shape, TypeInfo{types[2]}); + auto operand_rhs2 = graphs[2]->addOperand(shape, TypeInfo{types[2]}); + auto operand_result2 = graphs[2]->addOperand(shape, TypeInfo{types[2]}); + + // Model2's add operation + operation::BinaryArithmetic::Param param2; + param2.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; + param2.activation = Activation::NONE; + auto input_set2 = OperandIndexSequence{operand_lhs2, operand_rhs2}; + auto output_set2 = OperandIndexSequence{operand_result2}; + graphs[2]->addOperation( + std::make_unique(input_set2, output_set2, param2)); + + // Model1's inputs/outputs + graphs[2]->addInput(operand_lhs2); + graphs[2]->addInput(operand_rhs2); + graphs[2]->addOutput(operand_result2); + graphs[2]->verify(); + + // Compile + compile(); + } + +public: + void compile() + { + auto nnpkg = std::make_shared(); + coptions.clear(); + for (uint16_t i = 0; i < graphs.size(); ++i) + { + coptions.emplace_back(onert::compiler::CompilerOptions::fromGlobalConfig()); + + auto model = std::make_shared(); + model->push(SubgraphIndex{0}, graphs[i]); + + nnpkg->push(onert::ir::ModelIndex{i}, std::move(model)); + } + for (const auto &pkg_input : edges.pkg_inputs) + { + nnpkg->addInput(pkg_input); + } + for (const auto &pkg_output : edges.pkg_outputs) + { + nnpkg->addOutput(pkg_output); + } + for (const auto &edge : edges.edges) + { + nnpkg->addEdge(edge.from, edge.to); + } + auto compiler = onert::compiler::CompilerFactory::get().create(nnpkg, coptions); + nnpkg.reset(); + artifact = compiler->compile(); + } + +public: + std::vector> graphs; + std::vector> coptions; + std::shared_ptr artifact; + ModelEdges edges; +}; + TEST(ExecInstance, simple) { auto mockup = CompiledMockUpModel(); @@ -209,7 +365,7 @@ class Inference { public: Inference(const float (&input1)[4], const float (&input2)[4], float (&output)[4], - std::shared_ptr &executors) + std::shared_ptr &executors) : _input1{input1}, _input2{input2}, _output{output}, _executors{executors} { // DO NOTHING @@ -233,7 +389,7 @@ private: const float (&_input1)[4]; const float (&_input2)[4]; float (&_output)[4]; - std::shared_ptr &_executors; + std::shared_ptr &_executors; }; // Support multi-thread execution @@ -299,4 +455,181 @@ TEST(ExecInstance, async) } } +TEST(ExecInstance, multi_model_simple) +{ + auto mockup = CompiledMockUpMultiModel(); + auto executors = mockup.artifact->_executors; + + auto 
input1 = IOIndex{0}; + auto input2 = IOIndex{1}; + auto output = IOIndex{0}; + + const float input1_buffer[4] = {1, 0, -1, -2}; + const float input2_buffer[4] = {1, -3, 2, -4}; + float output_buffer[4] = {}; + const float output_expected[4] = {7, -5, 1, -7}; + + onert::exec::Execution execution{executors}; + + execution.setInput(input1, reinterpret_cast(input1_buffer), 16); + execution.setInput(input2, reinterpret_cast(input2_buffer), 16); + execution.setOutput(output, reinterpret_cast(output_buffer), 16); + execution.execute(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(output_buffer[i], output_expected[i]); + } +} + +TEST(ExecInstance, multi_model_twoCompile) +{ + auto mockup = CompiledMockUpMultiModel(); + auto executors1 = mockup.artifact->_executors; + onert::exec::Execution execution1{executors1}; + + auto input1 = IOIndex{0}; + auto input2 = IOIndex{1}; + auto output = IOIndex{0}; + + const float exe1_input1_buffer[4] = {1, 0, -1, -2}; + const float exe1_input2_buffer[4] = {1, -3, 2, -4}; + float exe1_output_buffer[4] = {}; + const float exe1_output_expected[4] = {7, -5, 1, -7}; + + execution1.setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); + execution1.setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); + execution1.setOutput(output, reinterpret_cast(exe1_output_buffer), 16); + + // Make new executor: compile again + mockup.compile(); + onert::exec::Execution execution2{mockup.artifact->_executors}; + + const float exe2_input1_buffer[4] = {2, 1, -2, 0}; + const float exe2_input2_buffer[4] = {-3, 3, 1, 2}; + float exe2_output_buffer[4] = {}; + const float exe2_output_expected[4] = {1, 9, -3, 9}; + + execution2.setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); + execution2.setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); + execution2.setOutput(output, reinterpret_cast(exe2_output_buffer), 16); + + execution1.execute(); + execution2.execute(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]); + EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]); + } +} + +// Support two initialized execution instance then ordered execution +TEST(ExecInstance, multi_model_twoExecution) +{ + auto mockup = CompiledMockUpMultiModel(); + auto executors = mockup.artifact->_executors; + auto input1 = IOIndex{0}; + auto input2 = IOIndex{1}; + auto output1 = IOIndex{0}; + + const float exe1_input1_buffer[4] = {1, 0, -1, -2}; + const float exe1_input2_buffer[4] = {1, -3, 2, -4}; + float exe1_output_buffer[4] = {}; + const float exe1_output_expected[4] = {7, -5, 1, -7}; + const float exe2_output_expected[4] = {1, 9, -3, 9}; + + onert::exec::Execution execution1{executors}; + execution1.setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); + execution1.setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); + execution1.setOutput(output1, reinterpret_cast(exe1_output_buffer), 16); + + const float exe2_input1_buffer[4] = {2, 1, -2, 0}; + const float exe2_input2_buffer[4] = {-3, 3, 1, 2}; + float exe2_output_buffer[4] = {}; + + // Make new execution + onert::exec::Execution execution2{executors}; + execution2.setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); + execution2.setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); + execution2.setOutput(output1, reinterpret_cast(exe2_output_buffer), 16); + + execution1.execute(); + execution1.execute(); + execution2.execute(); + execution2.execute(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]); 
+ EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]); + } +} + +// Multi-model is not thread-safe yet + +// Support asynchronous execution +TEST(ExecInstance, multi_model_async) +{ + auto mockup = CompiledMockUpMultiModel(); + auto executors = mockup.artifact->_executors; + + auto input1 = IOIndex{0}; + auto input2 = IOIndex{1}; + auto output = IOIndex{0}; + + const float input1_buffer[4] = {1, 0, -1, -2}; + const float input2_buffer[4] = {1, -3, 2, -4}; + float output_buffer[4] = {}; + const float output_expected[4] = {7, -5, 1, -7}; + + onert::exec::Execution execution{executors}; + + execution.setInput(input1, reinterpret_cast(input1_buffer), 16); + execution.setInput(input2, reinterpret_cast(input2_buffer), 16); + execution.setOutput(output, reinterpret_cast(output_buffer), 16); + execution.startExecute(); + execution.waitFinish(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(output_buffer[i], output_expected[i]); + } +} + +TEST(ExecInstance, multi_model_dequant_input_quant_output) +{ + auto mockup = CompiledMockUpMultiModel(); + auto executors = mockup.artifact->_executors; + + auto input1 = IOIndex{0}; + auto input2 = IOIndex{1}; + auto output = IOIndex{0}; + + const uint8_t input1_buffer[4] = {138, 128, 118, 108}; // {1, 0, -1, -2} + const uint8_t input2_buffer[4] = {138, 98, 148, 88}; // {1, -3, 2, -4} + uint8_t output_buffer[4] = {}; + const uint8_t output_expected[4] = {198, 78, 138, 58}; // {7, -5, 1, -7} + float scale = 0.1; + int32_t zero_point = 128; + + onert::exec::Execution execution{executors}; + + onert::ir::TypeInfo type_info{onert::ir::DataType::QUANT_UINT8_ASYMM, scale, zero_point}; + execution.setInput(input1, type_info, execution.getInputShape(input1), + reinterpret_cast(input1_buffer), 4, onert::ir::Layout::NHWC); + execution.setInput(input2, type_info, execution.getInputShape(input2), + reinterpret_cast(input2_buffer), 4, onert::ir::Layout::NHWC); + execution.setOutput(output, type_info, execution.getOutputShape(output), + reinterpret_cast(output_buffer), 4, onert::ir::Layout::NHWC); + execution.execute(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(output_buffer[i], output_expected[i]); + } +} + +// TODO Add an unittest multi_model_quant_input_dequant_output + } // namespace diff --git a/runtime/onert/core/src/exec/ExecutionObservee.cc b/runtime/onert/core/src/exec/ExecutionObservee.cc index d6a2bfd17..66610f0e0 100644 --- a/runtime/onert/core/src/exec/ExecutionObservee.cc +++ b/runtime/onert/core/src/exec/ExecutionObservee.cc @@ -28,7 +28,7 @@ void ExecutionObservee::add(std::unique_ptr observer) void ExecutionObservee::notifySubgraphBegin(ir::SubgraphIndex ind) { - for (auto &o : _observers) + for (auto &&o : _observers) { o->handleSubgraphBegin(ind); } @@ -36,7 +36,7 @@ void ExecutionObservee::notifySubgraphBegin(ir::SubgraphIndex ind) void ExecutionObservee::notifySubgraphEnd(ir::SubgraphIndex ind) { - for (auto &o : _observers) + for (auto &&o : _observers) { o->handleSubgraphEnd(ind); } @@ -45,7 +45,7 @@ void ExecutionObservee::notifySubgraphEnd(ir::SubgraphIndex ind) void ExecutionObservee::notifyJobBegin(IExecutor *executor, ir::SubgraphIndex subg_ind, ir::OperationIndex op_ind, const backend::Backend *backend) { - for (auto &o : _observers) + for (auto &&o : _observers) { o->handleJobBegin(executor, subg_ind, op_ind, backend); } @@ -54,7 +54,7 @@ void ExecutionObservee::notifyJobBegin(IExecutor *executor, ir::SubgraphIndex su void ExecutionObservee::notifyJobEnd(IExecutor *executor, ir::SubgraphIndex subg_ind, ir::OperationIndex op_ind, 
const backend::Backend *backend) { - for (auto &o : _observers) + for (auto &&o : _observers) { o->handleJobEnd(executor, subg_ind, op_ind, backend); } diff --git a/runtime/onert/core/src/exec/ExecutionObservers.h b/runtime/onert/core/src/exec/ExecutionObservers.h index 1aadac2f5..91fbac323 100644 --- a/runtime/onert/core/src/exec/ExecutionObservers.h +++ b/runtime/onert/core/src/exec/ExecutionObservers.h @@ -22,7 +22,7 @@ #include "../util/EventRecorder.h" #include "../util/EventWriter.h" -#include "exec/Executors.h" +#include "exec/IExecutor.h" #include "ir/Index.h" #include "ir/Operation.h" #include "util/ITimer.h" diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc index d2d204a0b..515cf8e48 100644 --- a/runtime/onert/core/src/exec/ExecutorBase.cc +++ b/runtime/onert/core/src/exec/ExecutorBase.cc @@ -29,8 +29,8 @@ ExecutorBase::ExecutorBase(std::unique_ptr &&lowered_gra backend::BackendContexts &&backend_contexts, const compiler::TensorRegistries &tensor_regs, const util::TracingCtx *tracing_ctx) - : _lowered_graph{std::move(lowered_graph)}, _backend_contexts{std::move(backend_contexts)}, - _graph{_lowered_graph->graph()}, _parent_graph{_lowered_graph->parent_graph()}, _mutex(), + : _lowered_graph{std::move(lowered_graph)}, + _backend_contexts{std::move(backend_contexts)}, _graph{_lowered_graph->graph()}, _mutex(), _tracing_ctx(tracing_ctx) { auto build_tensor_list = [&](const auto &ind_seq, auto &tensors) { @@ -120,9 +120,27 @@ void ExecutorBase::execute(const IODescription &desc) { tensor->set_dynamic(); tensor->setShape(input_shape->second); + /* + * Changes tensor shape and allocate memory since its shape was changed + * perhaps by nnfw_set_input_tensorinfo() + * + * Cases are: + * 1) static operand -> nnfw_set_input_tensorinfo() -> execute() -> execute() + * (a) (b) + * + * at (a), operand is static, tensor is static - memory dealloc is not needed + * (DynamicTensorManager cannot dealloc memory allocated by StaticTensorManager) + * at (b), operand is static, tensor is dynamic - memory dealloc is needed + * + * 2) dynamic operand -> nnfw_set_input_tensorinfo() -> execute() -> execute() + * (a) (b) + * + * at (a), operand is dynamic, tensor is dynamic - memory dealloc is not needed + * since it has not been allocated yet + * at (b), operand is dynamic, tensor is dynamic - memory dealloc is needed + */ + tensor->applyShape(input_shape->second); } - - handleDynamicInputTensor(ir::IOIndex{i}, desc); } assert(_output_tensors.size() == desc.outputs.size()); @@ -156,38 +174,9 @@ void ExecutorBase::execute(const IODescription &desc) } } -/** - * @brief Changes tensor shape and allocate memory - * if input shape was changed by nnfw_set_input_tensorinfo() - * - * @note Cases are: - * 1) static operand -> nnfw_set_input_tensorinfo() -> execute() -> execute() - * (a) (b) - * - * at (a), operand is static, tensor is static - memory dealloc is not needed - * (DynamicTensorManager cannot dealloc memory allocated by StaticTensorManager) - * at (b), operand is static, tensor is dynamic - memory dealloc is needed - * - * 2) dynamic operand -> nnfw_set_input_tensorinfo() -> execute() -> execute() - * (a) (b) - * - * at (a), operand is dynamic, tensor is dynamic - memory dealloc is not needed - * since it has not been allocated yet - * at (b), operand is dynamic, tensor is dynamic - memory dealloc is needed - */ -void ExecutorBase::handleDynamicInputTensor(ir::IOIndex io_ind, const IODescription &desc) -{ - auto shape_sig_found = 
desc.dynamic_input_shapes.find(io_ind); - if (shape_sig_found != desc.dynamic_input_shapes.end()) - { - auto changed_input_shape = shape_sig_found->second; - _input_tensors[io_ind.value()]->applyShape(changed_input_shape); - } -} - bool ExecutorBase::hasDynamicInput() { - for (auto &tensor : _input_tensors) + for (auto &&tensor : _input_tensors) { if (tensor->is_dynamic()) return true; diff --git a/runtime/onert/core/src/exec/ExecutorBase.h b/runtime/onert/core/src/exec/ExecutorBase.h index e4f914546..7aee3d9ee 100644 --- a/runtime/onert/core/src/exec/ExecutorBase.h +++ b/runtime/onert/core/src/exec/ExecutorBase.h @@ -51,9 +51,7 @@ public: virtual ~ExecutorBase() = default; - const ir::Graph &graph() final { return _graph; } - - const ir::Graph &parent_graph() final { return _parent_graph; } + const ir::Graph &graph() const final { return _graph; } void execute(const IODescription &desc) final; @@ -70,6 +68,11 @@ public: void addObserver(std::unique_ptr ref) { _subject.add(std::move(ref)); }; + const std::vector &getInputTensors() const override + { + return _input_tensors; + } + const std::vector &getOutputTensors() const override { return _output_tensors; @@ -87,14 +90,10 @@ protected: std::unique_ptr _lowered_graph; backend::BackendContexts _backend_contexts; const ir::Graph &_graph; - const ir::Graph &_parent_graph; std::vector _input_tensors; std::vector _output_tensors; std::mutex _mutex; const util::TracingCtx *_tracing_ctx; - -private: - void handleDynamicInputTensor(ir::IOIndex input_index, const IODescription &desc); }; } // namespace exec diff --git a/runtime/onert/core/src/exec/Executors.cc b/runtime/onert/core/src/exec/Executors.cc index e0ee24fea..3f4b3cc7f 100644 --- a/runtime/onert/core/src/exec/Executors.cc +++ b/runtime/onert/core/src/exec/Executors.cc @@ -14,170 +14,628 @@ * limitations under the License. */ -#include "exec/Executors.h" +#include "Executors.h" -namespace onert -{ -namespace exec +#include "../backend/builtin/IOTensor.h" + +namespace { -uint32_t Executors::inputSize() const +using namespace onert; + +int32_t find_input_index(const std::vector &pkg_inputs, + const ir::ModelIndex &model_index, const ir::SubgraphIndex &subg_index, + const ir::IOIndex &io_index) { - return _model_edges ? _model_edges->pkg_inputs.size() - : _executors.at(ir::SubgraphIndex{0})->graph().getInputs().size(); + for (size_t i = 0; i < pkg_inputs.size(); i++) + { + auto &input_desc = pkg_inputs[i]; + if ((std::get(input_desc) == model_index) && + (std::get(input_desc) == subg_index) && + (std::get(input_desc) == io_index)) + return static_cast(i); + } + return -1; } -uint32_t Executors::outputSize() const +int32_t find_output_index(const std::vector &pkg_outputs, + const ir::ModelIndex &model_index, const ir::SubgraphIndex &subg_index, + const ir::IOIndex &io_index) { - return _model_edges ? 
_model_edges->pkg_outputs.size() - : _executors.at(ir::SubgraphIndex{0})->graph().getOutputs().size(); + for (size_t i = 0; i < pkg_outputs.size(); i++) + { + auto &input_desc = pkg_outputs[i]; + if ((std::get(input_desc) == model_index) && + (std::get(input_desc) == subg_index) && + (std::get(input_desc) == io_index)) + return static_cast(i); + } + return -1; } -const ir::OperandInfo Executors::inputInfo(const ir::IOIndex &index) +} // namespace + +namespace onert +{ +namespace exec +{ + +class Executors::EdgeTensor : public backend::builtin::IOTensor { - if (_model_edges) +public: + EdgeTensor(const ir::OperandInfo &info, ir::Layout layout) + : backend::builtin::IOTensor(info, layout), _buffer{nullptr}, _ref_count{0} { - // Assume that each model may have only one subgraph - // TODO handle general case - const auto desc = _model_edges->pkg_inputs[index.value()]; - const auto model_idx = std::get<0>(desc); - const auto executor_idx = ir::SubgraphIndex{model_idx.value()}; - const auto input_index = _executors.at(executor_idx)->graph().getInputs().at(std::get<2>(desc)); - return _executors.at(executor_idx)->graph().operands().at(input_index).info(); } + ~EdgeTensor() = default; - const auto input_index = _executors.at(ir::SubgraphIndex{0})->graph().getInputs().at(index); - return _executors.at(ir::SubgraphIndex{0})->graph().operands().at(input_index).info(); -} + void allocate_buffer() + { + const auto total_size = orig_info().total_size(); + _buffer = std::make_unique(total_size); + _ref_count = 1; -const ir::OperandInfo Executors::outputInfo(const ir::IOIndex &index) -{ - if (_model_edges) + // NOTE Executor's inputs/outputs are always IPortableTensor. If backend of inputs/outputs + // is using tensor that does not inherit IPortableTensor, Permute operation is added + // and all inputs/outputs become IPortableTensor at compile stage. + // This allows user's buffers to be set to inputs/outputs of executors. 
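+ // [Editor's note] Illustrative sketch only, not part of this patch: the lifetime
+ // contract that EdgeTensor implements here can be reduced to a small stand-alone
+ // analogue (names such as RefCountedBuffer are hypothetical; the real class also
+ // publishes the buffer through setUserTensor() so consuming executors can read it):
+ //
+ //   #include <cassert>
+ //   #include <cstddef>
+ //   #include <cstdint>
+ //   #include <memory>
+ //
+ //   struct RefCountedBuffer
+ //   {
+ //     std::unique_ptr<uint8_t[]> buf;
+ //     int32_t refs = 0;
+ //
+ //     void allocate(size_t size)
+ //     {
+ //       buf = std::make_unique<uint8_t[]>(size); // producer allocates, refs == 1
+ //       refs = 1;
+ //     }
+ //     void increase_ref() { ++refs; } // one extra consumer of the edge
+ //     void decrease_ref()
+ //     {
+ //       assert(refs > 0);
+ //       if (--refs == 0)
+ //         buf.reset(); // last consumer done -> release the edge buffer
+ //     }
+ //   };
+ //
+ // The producing executor allocates the buffer (reference count 1), every additional
+ // consumer of the edge calls increase_ref(), and each consumer calls decrease_ref()
+ // once it has read the data, so the memory is released exactly after the last use,
+ // which is what allocate_buffer()/increase_ref()/decrease_ref() below implement.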
+ setUserTensor(_buffer.get(), total_size); + } + + void increase_ref() { _ref_count++; } + + void decrease_ref() { - // Assume that each model may have only one subgraph - // TODO handle general case - auto desc = _model_edges->pkg_outputs[index.value()]; - auto model_idx = std::get<0>(desc); - auto executor_idx = ir::SubgraphIndex{model_idx.value()}; - auto output_index = _executors.at(executor_idx)->graph().getOutputs().at(std::get<2>(desc)); - return _executors.at(executor_idx)->graph().operands().at(output_index).info(); + assert(_ref_count > 0); + _ref_count--; + if (_ref_count == 0) + { + _buffer.reset(); + setUserTensor(nullptr, orig_info().total_size()); + } } - auto output_index = _executors.at(ir::SubgraphIndex{0})->graph().getOutputs().at(index); - return _executors.at(ir::SubgraphIndex{0})->graph().operands().at(output_index).info(); +private: + std::unique_ptr _buffer; + int32_t _ref_count; +}; + +void Executors::emplace(const ir::ModelIndex &model_index, const ir::SubgraphIndex &subg_index, + std::unique_ptr exec) +{ + _executors.emplace(std::make_pair(model_index, subg_index), std::move(exec)); } -void Executors::execute(const IODescription &desc) +IExecutor *Executors::at(const ir::ModelIndex &model_index, + const ir::SubgraphIndex &subg_index) const +{ + return _executors.at(std::make_pair(model_index, subg_index)).get(); +} + +uint32_t Executors::inputSize() const { return _model_edges->pkg_inputs.size(); } + +uint32_t Executors::outputSize() const { return _model_edges->pkg_outputs.size(); } + +const ir::OperandInfo &Executors::inputInfo(const ir::IOIndex &index) const { - if (_model_edges) - return executeEntries(desc); + auto const desc = _model_edges->pkg_inputs[index.value()]; + auto const model_index = std::get<0>(desc); + auto const subg_index = std::get<1>(desc); + auto const io_index = std::get<2>(desc); + auto const executor = at(model_index, subg_index); + return executor->getInputTensors().at(io_index.value())->orig_info(); +} - _executors.at(ir::SubgraphIndex{0})->execute(desc); +const ir::OperandInfo &Executors::outputInfo(const ir::IOIndex &index) const +{ + auto const desc = _model_edges->pkg_outputs[index.value()]; + auto const model_index = std::get<0>(desc); + auto const subg_index = std::get<1>(desc); + auto const io_index = std::get<2>(desc); + auto const executor = at(model_index, subg_index); + return executor->getOutputTensors().at(io_index.value())->orig_info(); } -void Executors::executeEntries(const IODescription &desc) +// Allow below edges only +// m1 < m2, s1 == 0 and s2 == 0 if m1:s1:o1 -> m2:s2:o2' +void Executors::checkSupportedMultimodel() const { - // Assume 2 executors only - // Assume that each model may have only one subgraph - // TODO Support general case - if (_executors.size() != 2) - throw std::runtime_error{"NYI: Multi model execution for this package is not supported yet"}; + // If package includes no-connection model, model_count is less than real model count in package. 
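+ // [Editor's note] Illustrative sketch only, not part of this patch: the edge rule
+ // enforced by this method can be summarized as a predicate. For an edge
+ // m1:s1:o1 -> m2:s2:o2 only "forward" edges between primary subgraphs are accepted:
+ //
+ //   #include <cstdint>
+ //
+ //   // hypothetical helper mirroring the checks in the loop below
+ //   bool isSupportedEdge(uint16_t m_from, uint16_t s_from, uint16_t m_to, uint16_t s_to)
+ //   {
+ //     if (m_from == m_to)
+ //       return false; // edge inside a single model: reported as an invalid edge set
+ //     // backward edges or edges touching a non-primary subgraph: reported as NYI
+ //     return (m_from < m_to) && (s_from == 0) && (s_to == 0);
+ //   }
+ //
+ // e.g. 0:0:1 -> 1:0:0 passes, while 1:0:0 -> 0:0:0 (backward) or any edge whose
+ // subgraph index is not 0 is rejected by the checks that follow.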
+ // Then this method will throw exception based on model index + // 1st model: input assumption + // Otherwise: edges assumption - // Assume all edges are 0:0:x -> 1:0:x + // Assumption: edges + // m1 < m2, s1 == 0 and s2 == 0 if edge 'm1:s1:o1 -> m2:s2:o2' for (auto edge : _model_edges->edges) { - if ((std::get(edge.from) != ir::ModelIndex{0}) || - (std::get(edge.to) != ir::ModelIndex{1}) || - (std::get(edge.from) != ir::SubgraphIndex{0}) || - (std::get(edge.to) != ir::SubgraphIndex{0}) || - (std::get(edge.from) != std::get(edge.to))) - throw std::runtime_error{"NYI: Multi model execution for this edge is not supported yet"}; + auto const model_from = std::get(edge.from); + auto const model_to = std::get(edge.to); + auto const subg_from = std::get(edge.from); + auto const subg_to = std::get(edge.to); + + if (model_from.value() == model_to.value()) + { + throw std::runtime_error{"Multi model's edge set has invalid edge"}; + } + + if ((model_from.value() > model_to.value()) || (subg_from != ir::SubgraphIndex{0}) || + (subg_to != ir::SubgraphIndex{0})) + throw std::runtime_error{"NYI: Multi model execution for this edge set is not supported yet"}; } - // Assume all package inputs are 0:0:x - for (uint32_t i = 0; i < _model_edges->pkg_inputs.size(); i++) + // Assumption: package inputs + // All 1st model inputs come from package input if always m1 < m2 { - auto input = _model_edges->pkg_inputs[i]; - if ((std::get(input) != ir::ModelIndex{0}) || - (std::get(input) != ir::SubgraphIndex{0}) || - (std::get(input) != ir::IOIndex{i})) + auto first_executor = at(ir::ModelIndex{0}, ir::SubgraphIndex{0}); + auto search_first_model = [&](const ir::IOIndex &input_index) { + for (const auto &input : _model_edges->pkg_inputs) + { + if ((std::get(input) == ir::ModelIndex{0}) || + (std::get(input) == ir::SubgraphIndex{0}) || + (std::get(input) == input_index)) + return true; + } + + return false; + }; + + for (uint32_t i = 0; i < first_executor->getInputTensors().size(); i++) { - throw std::runtime_error{"NYI: Support package input to 1st model with same order"}; + if (!search_first_model(ir::IOIndex{i})) + throw std::runtime_error{"Cannot find 1st model's input buffer"}; } } - // Assume all package outputs are 1:0:x - for (uint32_t i = 0; i < _model_edges->pkg_outputs.size(); i++) + // Check whether nnpkg outputs and Edge `from` are duplicated + for (const auto &edge : _model_edges->edges) { - auto output = _model_edges->pkg_outputs[i]; - if ((std::get(output) != ir::ModelIndex{1}) || - (std::get(output) != ir::SubgraphIndex{0}) || - (std::get(output) != ir::IOIndex{i})) + if (std::find(_model_edges->pkg_outputs.begin(), _model_edges->pkg_outputs.end(), edge.from) != + _model_edges->pkg_outputs.end()) { - throw std::runtime_error{"NYI: Support package output from 2nd model with same order"}; + throw std::runtime_error{"Multi model execution does not support duplicating nnpkg outputs " + "with `from` of edges yet"}; } } +} + +void Executors::createEdgeQuantLayers() +{ + if (_is_created_edge_quant_layers) + { + return; + } - const auto &executor1 = _executors.at(ir::SubgraphIndex{0}); - const auto &graph1 = executor1->graph(); - const auto &executor2 = _executors.at(ir::SubgraphIndex{1}); - const auto &graph2 = executor2->graph(); + // Create EdgeTensor for edges between executors + for (const auto &pair : _edge_map) + { + const auto &from_iodesc = pair.first; + const auto &from_model_index = std::get(from_iodesc); + const auto &from_subg_index = std::get(from_iodesc); + const auto &from_io_index = 
std::get(from_iodesc); + + const auto from_executor = _executors.at({from_model_index, from_subg_index}).get(); + const auto from_tensor = from_executor->getOutputTensors().at(from_io_index.value()); + + const auto &from_info = from_tensor->orig_info(); + const auto from_layout = from_tensor->orig_layout(); + _edge_tensors[from_iodesc] = std::make_unique(from_info, from_layout); + } - if ((graph1.getInputs().size() != _model_edges->pkg_inputs.size()) || - (graph2.getOutputs().size() != _model_edges->pkg_outputs.size()) || - (graph1.getOutputs().size() != graph2.getInputs().size()) || - (graph1.getOutputs().size() != _model_edges->edges.size())) + // Append type-aware quantization layer for edges between executors + for (const auto &executor_pair : _executors) { - throw std::runtime_error{"NYI: Unsupported model edge pattern"}; + const auto &executor_index = executor_pair.first; + const auto &model_index = executor_index.first; + const auto &subg_index = executor_index.second; + + std::vector inputs; + std::vector outputs; + for (const auto &pair : _edge_map) + { + const auto &from_iodesc = pair.first; + if (std::get(from_iodesc) == model_index && + std::get(from_iodesc) == subg_index) + { + const auto from_tensor = _edge_tensors[from_iodesc].get(); + const auto &to_list = pair.second; + + for (const auto &to_iodesc : to_list) + { + const auto &to_model_index = std::get(to_iodesc); + const auto &to_subg_index = std::get(to_iodesc); + const auto &to_io_index = std::get(to_iodesc); + + const auto to_executor = _executors.at({to_model_index, to_subg_index}).get(); + const auto to_tensor = to_executor->getInputTensors().at(to_io_index.value()); + + // TODO Unify tensors with the same `from` tensor and same type + if (from_tensor->data_type() != to_tensor->data_type()) + { + assert(inputs.size() == outputs.size()); + const auto &to_info = + to_executor->getInputTensors().at(to_io_index.value())->orig_info(); + const auto to_layout = to_tensor->orig_layout(); + inputs.emplace_back(from_tensor); + + auto type_aware_quant_tensor = std::make_unique(to_info, to_layout); + outputs.emplace_back(type_aware_quant_tensor.get()); + + _edge_quant_tensors[to_iodesc] = std::move(type_aware_quant_tensor); + } + } + } + } + + auto layer = std::make_unique(inputs, outputs); + layer->prepare(); + _edge_quant_layers[{model_index, subg_index}] = std::move(layer); } - // Prepare buffer - // Assume buffer layout is NHWC - std::vector> bufs(_model_edges->edges.size()); - std::vector buf_infos(_model_edges->edges.size()); - const auto layout = ir::Layout::NHWC; + _is_created_edge_quant_layers = true; +} - for (uint32_t i = 0; i < graph1.getOutputs().size(); i++) +void Executors::CreatePkgIOTensors(const IODescription &desc) +{ + for (const auto &pkg_input : _model_edges->pkg_inputs) { - const auto buf_index = - _executors.at(ir::SubgraphIndex{0})->graph().getOutputs().at(ir::IOIndex{i}); - buf_infos[i] = &_executors.at(ir::SubgraphIndex{0})->graph().operands().at(buf_index).info(); - const auto buf_size = buf_infos[i]->total_size(); - bufs[i] = std::make_unique(buf_size); + // Create IOTensor for nnpkg inputs + const auto &model_index = std::get(pkg_input); + const auto &subg_index = std::get(pkg_input); + const auto &io_index = std::get(pkg_input); + const auto input_pkg_index = + find_input_index(_model_edges->pkg_inputs, model_index, subg_index, io_index); + auto input_desc = desc.inputs[input_pkg_index].get(); + _pkg_input_tensors[pkg_input] = + std::make_unique(input_desc->info, input_desc->layout); } - // 1st 
executor + for (const auto &pkg_output : _model_edges->pkg_outputs) { - IODescription desc1; - const auto input_size = graph1.getInputs().size(); - const auto output_size = graph1.getOutputs().size(); - desc1.inputs.resize(input_size); - desc1.outputs.resize(output_size); - for (uint32_t i = 0; i < input_size; i++) - desc1.inputs[i] = std::make_unique(*desc.inputs[i].get()); - for (uint32_t i = 0; i < output_size; i++) - desc1.outputs[i] = std::make_unique(*buf_infos[i], bufs[i].get(), - buf_infos[i]->total_size(), layout); + // Create IOTensor for nnpkg outputs + const auto &model_index = std::get(pkg_output); + const auto &subg_index = std::get(pkg_output); + const auto &io_index = std::get(pkg_output); + const auto output_pkg_index = + find_output_index(_model_edges->pkg_outputs, model_index, subg_index, io_index); + auto output_desc = desc.outputs[output_pkg_index].get(); + _pkg_output_tensors[pkg_output] = + std::make_unique(output_desc->info, output_desc->layout); + } +} - executor1->execute(desc1); +void Executors::createPkgIOQuantLayers(const IODescription &desc) +{ + // Append type-aware quantization layer for nnpkg inputs/outputs between executors + for (const auto &pair : _executors) + { + const auto &executor_index = pair.first; + const auto &model_index = executor_index.first; + const auto &subg_index = executor_index.second; + const auto executor = pair.second.get(); + + // Find pkg inputs of current executor + std::vector pkg_inputs; + for (const auto &pkg_input : _model_edges->pkg_inputs) + { + if (std::get(pkg_input) == model_index && + std::get(pkg_input) == subg_index) + { + pkg_inputs.emplace_back(pkg_input); + } + } + std::vector src_tensors; + std::vector dst_tensors; + for (const auto &pkg_input : pkg_inputs) + { + const auto &io_index = std::get(pkg_input); + const auto input_pkg_index = + find_input_index(_model_edges->pkg_inputs, model_index, subg_index, io_index); + auto input_desc = desc.inputs[input_pkg_index].get(); + + // Create EdgeTensor for nnpkg input if type is different + const auto input_tensor = + executor->getInputTensors().at(std::get(pkg_input).value()); + const auto &orig_info = input_tensor->orig_info(); + if (input_desc->info.typeInfo().type() != input_tensor->orig_info().typeInfo().type()) + { + const auto orig_layout = input_tensor->orig_layout(); + auto pkg_input_edge_tensor = std::make_unique(orig_info, orig_layout); + _pkg_input_quant_tensors[pkg_input] = std::move(pkg_input_edge_tensor); + + // Append type-aware quantization layer's inputs/outputs + src_tensors.emplace_back(_pkg_input_tensors[pkg_input].get()); + dst_tensors.emplace_back(_pkg_input_quant_tensors[pkg_input].get()); + } + } + + // Create type-aware quantization layer for nnpkg inputs + auto pkg_input_layer = std::make_unique(src_tensors, dst_tensors); + pkg_input_layer->prepare(); + _pkg_input_quant_layers[{model_index, subg_index}] = std::move(pkg_input_layer); + + // Find pkg outputs of current executor + std::vector pkg_outputs; + for (const auto &pkg_output : _model_edges->pkg_outputs) + { + if (std::get(pkg_output) == model_index && + std::get(pkg_output) == subg_index) + { + pkg_outputs.emplace_back(pkg_output); + } + } + src_tensors.clear(); + dst_tensors.clear(); + // Create Tensors of nnpkg outputs for type-aware quantization + for (const auto &pkg_output : pkg_outputs) + { + const auto &io_index = std::get(pkg_output); + const auto output_pkg_index = + find_output_index(_model_edges->pkg_outputs, model_index, subg_index, io_index); + auto output_desc = 
desc.outputs[output_pkg_index].get(); + + // Create EdgeTensor for nnpkg output if type is different + const auto output_tensor = + executor->getOutputTensors().at(std::get(pkg_output).value()); + const auto &orig_info = output_tensor->orig_info(); + if (output_desc->info.typeInfo().type() != output_tensor->orig_info().typeInfo().type()) + { + const auto orig_layout = output_tensor->orig_layout(); + auto pkg_output_edge_tensor = std::make_unique(orig_info, orig_layout); + _pkg_output_quant_tensors[pkg_output] = std::move(pkg_output_edge_tensor); + + // Append type-aware quantization layer's inputs/outputs + src_tensors.emplace_back(_pkg_output_quant_tensors[pkg_output].get()); + dst_tensors.emplace_back(_pkg_output_tensors[pkg_output].get()); + } + } + + // Create type-aware quantization layer for nnpkg outputs + auto pkg_output_layer = std::make_unique(src_tensors, dst_tensors); + pkg_output_layer->prepare(); + _pkg_output_quant_layers[{model_index, subg_index}] = std::move(pkg_output_layer); } +} + +void Executors::execute(const IODescription &desc) +{ + // Check supported multi model package + checkSupportedMultimodel(); + + // TODO Move creating type-aware quantization layers for edges in compilation stage + createEdgeQuantLayers(); + + // TODO Create IOTensors only once and recreate them only if nnpkg info changes + CreatePkgIOTensors(desc); + + // TODO Create type-aware quantization layers only once and recreate them only if type changes + createPkgIOQuantLayers(desc); - // 2nd executor + // TODO Find better way to schedule order of executors + auto const model_count = modelCount(); + + auto find_from = [&](const ir::ModelIndex &model_index, const ir::SubgraphIndex &subg_index, + const ir::IOIndex &io_index) { + for (const auto &edge : _model_edges->edges) + { + if ((std::get(edge.to) == model_index) && + (std::get(edge.to) == subg_index) && + (std::get(edge.to) == io_index)) + return edge.from; + } + + throw std::runtime_error{"Cannot find edge for model input"}; + }; + + // Execute each model + // NOTE May be better to use vector instead of unordered_map for _executors + for (auto model_index = ir::ModelIndex{0}; model_index.value() < model_count; model_index++) { - IODescription desc2; - const auto input_size = graph2.getInputs().size(); - const auto output_size = graph2.getOutputs().size(); - desc2.inputs.resize(input_size); - desc2.outputs.resize(output_size); + // Find executor + auto executor = at(model_index, ir::SubgraphIndex{0}); + + // Set IOTensors + // TODO Set internal IOTensors only once + std::vector inputs_inter; + std::vector outputs_inter; + const auto &input_tensors = executor->getInputTensors(); + const auto &output_tensors = executor->getOutputTensors(); + auto const input_size = input_tensors.size(); + auto const output_size = output_tensors.size(); + inputs_inter.resize(input_size); + outputs_inter.resize(output_size); + + // Set inputs of executor + // TODO Create layer to allocate/deallocate buffers of EdgeTensor for each executor for (uint32_t i = 0; i < input_size; i++) - desc2.inputs[i] = std::make_unique(*buf_infos[i], bufs[i].get(), - buf_infos[i]->total_size(), layout); + { + const auto input_pkg_index = find_input_index(_model_edges->pkg_inputs, model_index, + ir::SubgraphIndex{0}, ir::IOIndex{i}); + const auto input_io_desc = ir::IODesc{model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}}; + if (input_pkg_index != -1) + { + // Allocate type-aware quantization tensors for nnpkg inputs and set internal tensors + if 
(_pkg_input_quant_tensors.find(input_io_desc) != _pkg_input_quant_tensors.end()) + { + _pkg_input_quant_tensors[input_io_desc]->allocate_buffer(); + + inputs_inter[i] = _pkg_input_quant_tensors[input_io_desc].get(); + } + else + { + inputs_inter[i] = _pkg_input_tensors[input_io_desc].get(); + } + + // Set buffer of IOTensor + auto input_desc = desc.inputs[input_pkg_index].get(); + // TODO Remove const_cast (we need const_cast as ITensor is writable) + _pkg_input_tensors[input_io_desc]->setUserTensor( + reinterpret_cast(const_cast(input_desc->buffer)), input_desc->size); + } + else + { + auto from_iodesc = find_from(model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}); + const auto &from_model_index = std::get(from_iodesc); + const auto &from_subg_index = std::get(from_iodesc); + const auto &from_ioindex = std::get(from_iodesc).value(); + + // Supported only sequantial execution of models + assert(from_model_index.value() < model_index.value()); + assert(from_subg_index.value() == 0); + const auto from_executor = _executors.at({from_model_index, from_subg_index}).get(); + const auto to_iodesc = ir::IODesc{model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}}; + if (_edge_quant_tensors.find(to_iodesc) == _edge_quant_tensors.end()) + { + inputs_inter[i] = from_executor->getOutputTensors().at(from_ioindex); + } + else + { + inputs_inter[i] = _edge_quant_tensors.at(to_iodesc).get(); + } + assert(inputs_inter[i]->buffer() != nullptr); + } + } + + // Set outputs of executor for (uint32_t i = 0; i < output_size; i++) - desc2.outputs[i] = std::make_unique(*desc.outputs[i].get()); + { + const auto output_pkg_index = find_output_index(_model_edges->pkg_outputs, model_index, + ir::SubgraphIndex{0}, ir::IOIndex{i}); + const auto output_io_desc = ir::IODesc{model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}}; + if (output_pkg_index != -1) + { + // Allocate type-aware quantization tensors for nnpkg outputs and set internal tensors + if (_pkg_output_quant_tensors.find(output_io_desc) != _pkg_output_quant_tensors.end()) + { + _pkg_output_quant_tensors[output_io_desc]->allocate_buffer(); + + outputs_inter[i] = _pkg_output_quant_tensors[output_io_desc].get(); + } + else + { + outputs_inter[i] = _pkg_output_tensors[output_io_desc].get(); + } + + // Set buffer of IOTensor + auto output_desc = desc.outputs[output_pkg_index].get(); + _pkg_output_tensors[output_io_desc]->setUserTensor( + reinterpret_cast(output_desc->buffer), output_desc->size); + } + else + { + // Allocate buffer of `from` tensors + const auto from_iodesc = ir::IODesc{model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}}; + _edge_tensors[from_iodesc]->allocate_buffer(); + outputs_inter[i] = _edge_tensors[from_iodesc].get(); - executor2->execute(desc2); + // Allocate buffer of tensors for type-aware quantization + for (const auto &to_iodesc : _edge_map[from_iodesc]) + { + _edge_tensors[from_iodesc]->increase_ref(); + if (_edge_quant_tensors.find(to_iodesc) != _edge_quant_tensors.end()) + { + auto type_aware_quant_tensor = _edge_quant_tensors.at(to_iodesc).get(); + type_aware_quant_tensor->allocate_buffer(); + + _edge_tensors[from_iodesc]->decrease_ref(); + } + } + } + } + + _pkg_input_quant_layers[{model_index, ir::SubgraphIndex{0}}]->run(); + + executor->execute(inputs_inter, outputs_inter); + + _edge_quant_layers[{model_index, ir::SubgraphIndex{0}}]->run(); + _pkg_output_quant_layers[{model_index, ir::SubgraphIndex{0}}]->run(); + + // Release input buffers that are no longer needed + for (uint32_t i = 0; i < input_size; i++) + { + const auto 
input_pkg_index = find_input_index(_model_edges->pkg_inputs, model_index, + ir::SubgraphIndex{0}, ir::IOIndex{i}); + + const auto to_iodesc = ir::IODesc{model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}}; + if (input_pkg_index == -1) + { + if (_edge_quant_tensors.find(to_iodesc) != _edge_quant_tensors.end()) + { + // Decrease reference count of tensor for type-aware quantization if input tensor is the + // tensor + const auto to_iodesc = ir::IODesc{model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}}; + if (_edge_quant_tensors.find(to_iodesc) != _edge_quant_tensors.end()) + { + _edge_quant_tensors[to_iodesc]->decrease_ref(); + } + } + else + { + // Decrease reference count of `from` tensor if input tensor is the `from` tensor + const auto from_iodesc = find_from(model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}); + _edge_tensors[from_iodesc]->decrease_ref(); + + // Decrease reference count of nnpkg inputs + if (_pkg_input_quant_tensors.find(to_iodesc) != _pkg_input_quant_tensors.end()) + { + _pkg_input_quant_tensors[to_iodesc]->decrease_ref(); + } + } + } + } + + // Release output buffers if those buffers are no longer used other executors because of + // type-aware quantization + // FIXME if tensors for type-aware quantization unified for the same `from` tensor and same type + for (uint32_t i = 0; i < output_size; i++) + { + auto from_iodesc = ir::IODesc{model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}}; + + // Check if other executors will use the buffer of edge tensor + const auto &to_list = _edge_map[from_iodesc]; + if (to_list.size() == 0) + { + // This condition means `from_iodesc` tensor is an output of nnpkg + continue; + } + + bool to_be_release = + !std::any_of(to_list.begin(), to_list.end(), [&](const ir::IODesc &to_iodesc) { + // This condition means another executor uses the buffer of edge tensor + return _edge_quant_tensors.find(to_iodesc) == _edge_quant_tensors.end(); + }); + + if (to_be_release) + { + // This edge tensor's buffer won't be used in other executors + // Tensors for type-aware quantization take over the role of this edge tensor instead + _edge_tensors[from_iodesc]->decrease_ref(); + } + + // Decrease reference count of nnpkg outputs + if (_pkg_output_quant_tensors.find(from_iodesc) != _pkg_output_quant_tensors.end()) + { + _pkg_output_quant_tensors[from_iodesc]->decrease_ref(); + } + } } } +// modelCount() iterates _executors. +// It assumes that Compiler will generate Executor for all models and _executors includes all +// generated Executor. +// If nnpackage includes model(s) which has no connection and Compiler does not +// generate Executor for them, modelCount() return less value than real model count. +uint16_t Executors::modelCount() const +{ + uint16_t model_count = 0; + for (; _executors.find(std::make_pair(ir::ModelIndex{model_count}, ir::SubgraphIndex{0})) != + _executors.end(); + model_count++) + ; + + return model_count; +} + } // namespace exec } // namespace onert diff --git a/runtime/onert/core/src/exec/Executors.h b/runtime/onert/core/src/exec/Executors.h new file mode 100644 index 000000000..ac7489186 --- /dev/null +++ b/runtime/onert/core/src/exec/Executors.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_EXEC_EXECUTORS_H__ +#define __ONERT_EXEC_EXECUTORS_H__ + +#include "exec/IExecutors.h" +#include "ir/NNPkg.h" +#include "IPermuteFunction.h" + +namespace std +{ + +template <> struct hash> +{ + size_t + operator()(const std::pair<::onert::ir::ModelIndex, ::onert::ir::SubgraphIndex> &pair) const + noexcept + { + return (hash()(pair.first.value()) << 16) ^ hash()(pair.second.value()); + } +}; + +} // namespace std + +namespace onert +{ +namespace exec +{ + +/** + * @brief Class to gather executors + */ +class Executors : public IExecutors +{ +public: + Executors(void) = delete; + Executors(std::unique_ptr model_edges) + : _executors{}, _model_edges{std::move(model_edges)}, _edge_quant_layers{}, + _edge_quant_tensors{}, _edge_tensors{}, _is_created_edge_quant_layers{false}, + _pkg_input_quant_layers{}, _pkg_output_quant_layers{}, _pkg_input_quant_tensors{}, + _pkg_output_quant_tensors{}, _pkg_input_tensors{}, _pkg_output_tensors{} + { + for (const auto &edge : _model_edges->edges) + { + _edge_map[edge.from].emplace_back(edge.to); + } + } + Executors(const Executors &) = delete; + Executors(Executors &&) = default; + ~Executors() = default; + + // TODO Use Executor index + void emplace(const ir::ModelIndex &model_index, const ir::SubgraphIndex &subg_index, + std::unique_ptr exec) override; + + IExecutor *at(const ir::ModelIndex &model_index, + const ir::SubgraphIndex &subg_index) const override; + + uint32_t inputSize() const override; + + uint32_t outputSize() const override; + + const ir::OperandInfo &inputInfo(const ir::IOIndex &index) const override; + + const ir::OperandInfo &outputInfo(const ir::IOIndex &index) const override; + + void execute(const IODescription &desc) override; + +private: + void checkSupportedMultimodel() const; + void createEdgeQuantLayers(); + void CreatePkgIOTensors(const IODescription &desc); + void createPkgIOQuantLayers(const IODescription &desc); + uint16_t modelCount() const; + +private: + // TODO Remove this class + class PermuteLayer : public exec::IPermuteFunction + { + public: + PermuteLayer(const std::vector &inputs, + const std::vector &outputs) + { + assert(inputs.size() == outputs.size()); + _src_tensors = inputs; + _dst_tensors = outputs; + } + virtual ~PermuteLayer() {} + void optimize() override {} + }; + + class EdgeTensor; + +private: + std::unordered_map, std::unique_ptr> + _executors; + + // NOTE _model_edges may use different struct type for executor implementation + std::unique_ptr _model_edges; + std::unordered_map> _edge_map; + + /** + * @brief Type-aware quantization layers for edges between executors + * + */ + // TODO Move variables related to type-aware quantization for edges into compilation stage + // TODO Replace PermuteLayer with backend::builtin::kernel::PermuteLayer + std::unordered_map, std::unique_ptr> + _edge_quant_layers; + + /** + * @brief Tensors for type-aware quantization of edges + * Key: `to` IODesc, Value: EdgeTensor + */ + // + // Q: Why is Key `to` IODesc + // A: these tensors are currently created depending on the type of `to` + // TODO Unify tensors with the same `from` tensor 
and same type + // NOTE The incomplete type 'EdgeTensor' cannot be declared as unique_ptr. + std::unordered_map> _edge_quant_tensors; + + /** + * @brief Tensors for edges between executors that are not related to type-aware quantization + * Key: `from` IODesc, Value: EdgeTensor + */ + // Q: Why is Key `from` IODesc + // A: `from` can be connected to multiple `to` + // NOTE The incomplete type 'EdgeTensor' cannot be declared as unique_ptr. + std::unordered_map> _edge_tensors; + /** + * @brief Whether type-aware quantization layers for edges between executors are created + * + */ + // TODO Remove this member after the creation of type-aware quantization layers for edges + // is moved into compilation stage + bool _is_created_edge_quant_layers; + + // TODO Replace PermuteLayer with backend::builtin::kernel::PermuteLayer + std::unordered_map, std::unique_ptr> + _pkg_input_quant_layers; + // TODO Replace PermuteLayer with backend::builtin::kernel::PermuteLayer + std::unordered_map, std::unique_ptr> + _pkg_output_quant_layers; + // Edge tensors of nnpkg inputs/outputs for type-aware quantization + std::unordered_map> _pkg_input_quant_tensors; + std::unordered_map> _pkg_output_quant_tensors; + // IOTensors for user buffer + std::unordered_map> _pkg_input_tensors; + std::unordered_map> _pkg_output_tensors; +}; + +} // namespace exec +} // namespace onert + +#endif // __ONERT_EXEC_EXECUTORS_H__ diff --git a/runtime/onert/core/src/exec/IPermuteFunction.cc b/runtime/onert/core/src/exec/IPermuteFunction.cc new file mode 100644 index 000000000..9d548e6dc --- /dev/null +++ b/runtime/onert/core/src/exec/IPermuteFunction.cc @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "IPermuteFunction.h" + +#include +#include +#include "backend/IPortableTensor.h" +#include "exec/IFunction.h" +#include "ir/Index.h" +#include "ir/Shape.h" +#include +#include +#include +#include "util/Utils.h" +#include +#include + +namespace +{ +using namespace onert; + +inline nnfw::cker::Shape getShape(const backend::ITensor *tensor) +{ + const ir::Shape shape = tensor->getShape(); + + assert(tensor->layout() == ir::Layout::NHWC); + + auto rank = shape.rank(); + nnfw::cker::Shape ret(rank); + auto data = ret.DimsData(); + for (int i = 0; i < rank; ++i) + { + data[i] = shape.dim(i); + } + return ret; +} + +// Quantize per element +template +void elementwiseQuantize(const backend::ITensor *src_tensor, backend::ITensor *dst_tensor) +{ + const auto scale = dst_tensor->data_scale(); + const auto zero_point = dst_tensor->data_zero_point(); + + int min_val = std::numeric_limits::min(); + int max_val = std::numeric_limits::max(); + + auto loop_shape = src_tensor->getShape(); + const auto src_layout = src_tensor->layout(); + const auto dst_layout = dst_tensor->layout(); + const bool is_permutation = src_layout != dst_layout && loop_shape.rank() == 4; + ShapeLoop(loop_shape, [&](const onert::ir::Coordinates &coords) { + const InputT *input_data = + reinterpret_cast(src_tensor->buffer() + src_tensor->calcOffset(coords)); + int32_t unclamped = static_cast(round(*input_data / scale)) + zero_point; + int32_t clamped = std::min(std::max(unclamped, min_val), max_val); + + ir::Coordinates dst_coords = + is_permutation ? ir::convertCoordinates(coords, src_layout, dst_layout) : coords; + OutputT *output_data = + reinterpret_cast(dst_tensor->buffer() + dst_tensor->calcOffset(dst_coords)); + *output_data = clamped; + }); +} + +// TODO Optimize the case where tensors has the same layout +template +void quantize(const backend::ITensor *src_tensor, backend::ITensor *dst_tensor) +{ + if (!src_tensor->has_padding() && !dst_tensor->has_padding() && + src_tensor->layout() == dst_tensor->layout() && !src_tensor->is_dynamic()) + { + assert(!dst_tensor->is_dynamic()); + + // Call optimized neon kernel + nnfw::cker::Quantize(getShape(src_tensor), + reinterpret_cast(src_tensor->buffer()), + getShape(dst_tensor), reinterpret_cast(dst_tensor->buffer()), + dst_tensor->data_scale(), dst_tensor->data_zero_point()); + } + else + { + elementwiseQuantize(src_tensor, dst_tensor); + } +} + +// Dequantize per element +template +void elementwiseDequantize(const backend::ITensor *src_tensor, backend::ITensor *dst_tensor) +{ + const auto scale = src_tensor->data_scale(); + const auto zero_point = src_tensor->data_zero_point(); + + auto loop_shape = src_tensor->getShape(); + const auto src_layout = src_tensor->layout(); + const auto dst_layout = dst_tensor->layout(); + const bool is_permutation = src_layout != dst_layout && loop_shape.rank() == 4; + ShapeLoop(loop_shape, [&](const onert::ir::Coordinates &coords) { + const InputT *input_data = + reinterpret_cast(src_tensor->buffer() + src_tensor->calcOffset(coords)); + const OutputT result = static_cast(scale * (*input_data - zero_point)); + + ir::Coordinates dst_coords = + is_permutation ? 
ir::convertCoordinates(coords, src_layout, dst_layout) : coords; + OutputT *output_data = + reinterpret_cast(dst_tensor->buffer() + dst_tensor->calcOffset(dst_coords)); + *output_data = result; + }); +} + +// TODO Optimize the case where tensors has the same layout +template +void dequantize(const backend::ITensor *src_tensor, backend::ITensor *dst_tensor) +{ + if (!src_tensor->has_padding() && !dst_tensor->has_padding() && + src_tensor->layout() == dst_tensor->layout() && !src_tensor->is_dynamic()) + { + assert(!dst_tensor->is_dynamic()); + + // Call optimized neon kernel + nnfw::cker::Dequantize(getShape(src_tensor), + reinterpret_cast(src_tensor->buffer()), + getShape(dst_tensor), reinterpret_cast(dst_tensor->buffer()), + src_tensor->data_scale(), src_tensor->data_zero_point()); + } + else + { + elementwiseDequantize(src_tensor, dst_tensor); + } +} + +template ::value && + std::is_base_of::value, + bool> = true> +void typeAwareQuantize(const SRC_T *src_tensor, DST_T *dst_tensor) +{ + // TODO Support other types + if (src_tensor->data_type() == ir::DataType::FLOAT32) + { + switch (dst_tensor->data_type()) + { + case ir::DataType::QUANT_UINT8_ASYMM: + { + quantize(src_tensor, dst_tensor); + break; + } + case ir::DataType::QUANT_INT8_SYMM: + { + quantize(src_tensor, dst_tensor); + break; + } + case ir::DataType::QUANT_INT16_SYMM: + { + quantize(src_tensor, dst_tensor); + break; + } + default: + { + throw std::runtime_error("IPermuteFunction: Unsupported quantization type"); + break; + } + } + } + else if (dst_tensor->data_type() == ir::DataType::FLOAT32) + { + switch (src_tensor->data_type()) + { + case ir::DataType::QUANT_UINT8_ASYMM: + { + dequantize(src_tensor, dst_tensor); + break; + } + case ir::DataType::QUANT_INT8_SYMM: + { + dequantize(src_tensor, dst_tensor); + break; + } + case ir::DataType::QUANT_INT16_SYMM: + { + dequantize(src_tensor, dst_tensor); + break; + } + default: + { + throw std::runtime_error("IPermuteFunction: Unsupported dequantization type"); + break; + } + } + } + else + { + throw std::runtime_error("IPermuteFunction: Unsupported type for type-aware quantization yet"); + } +} + +} // namespace + +namespace onert +{ +namespace exec +{ + +void IPermuteFunction::IPermuteFunction::run() +{ + // TODO Optimization : Make control does not reach here? 
when (_src_tensors.size() == 0) + assert(_src_tensors.size() == _dst_tensors.size()); + if (_src_tensors_offsets.size() == 0) + { + _src_tensors_offsets.resize(_src_tensors.size()); + _dst_tensors_offsets.resize(_dst_tensors.size()); + } + assert(_src_tensors.size() == _src_tensors_offsets.size()); + assert(_src_tensors_offsets.size() == _dst_tensors_offsets.size()); + + for (size_t i = 0; i < _src_tensors.size(); ++i) + { + auto src_tensor = _src_tensors.at(i); + auto dst_tensor = _dst_tensors.at(i); + auto &src_offsets = _src_tensors_offsets.at(i); + auto &dst_offsets = _dst_tensors_offsets.at(i); + if (src_tensor != dst_tensor) + { + const auto rank = src_tensor->getShape().rank(); + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + } + } +} + +void IPermuteFunction::permute(backend::ITensor *src_tensor, backend::ITensor *dst_tensor, + size_t rank, std::vector &src_offsets, + std::vector &dst_offsets) +{ + if (src_tensor->total_size() == 0) + { + assert(dst_tensor->total_size() == 0); + return; + } + + assert(src_tensor != dst_tensor); + if (underlying_type(src_tensor->data_type()) != underlying_type(dst_tensor->data_type())) + { + typeAwareQuantize(src_tensor, dst_tensor); + return; + } + + switch (src_tensor->data_type()) + { + case ir::DataType::FLOAT32: + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + break; + case ir::DataType::INT32: + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + break; + case ir::DataType::UINT32: + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + break; + case ir::DataType::BOOL8: + case ir::DataType::QUANT_UINT8_ASYMM: + case ir::DataType::UINT8: + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + break; + case ir::DataType::QUANT_INT8_ASYMM: + case ir::DataType::QUANT_INT8_SYMM: + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + break; + case ir::DataType::INT64: + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + break; + case ir::DataType::QUANT_INT16_SYMM: + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + break; + default: + throw std::runtime_error("IPermuteFunction: Not supported data type"); + break; + } +} + +const std::type_info &IPermuteFunction::underlying_type(ir::DataType type) const +{ + switch (type) + { + case ir::DataType::FLOAT32: + return typeid(float); + case ir::DataType::INT32: + return typeid(int32_t); + case ir::DataType::UINT32: + return typeid(uint32_t); + case ir::DataType::INT64: + return typeid(int64_t); + case ir::DataType::BOOL8: + case ir::DataType::QUANT_UINT8_ASYMM: + case ir::DataType::UINT8: + return typeid(uint8_t); + case ir::DataType::QUANT_INT8_ASYMM: + case ir::DataType::QUANT_INT8_SYMM: + return typeid(int8_t); + case ir::DataType::QUANT_INT16_SYMM: + return typeid(int16_t); + default: + throw std::runtime_error("IPermuteFunction: Not supported data type"); + } +} + +} // namespace exec +} // namespace onert diff --git a/runtime/onert/core/src/exec/IPermuteFunction.h b/runtime/onert/core/src/exec/IPermuteFunction.h index eb54b67ae..e790f3290 100644 --- a/runtime/onert/core/src/exec/IPermuteFunction.h +++ b/runtime/onert/core/src/exec/IPermuteFunction.h @@ -25,11 +25,7 @@ #include "backend/ITensor.h" #include "exec/IFunction.h" -#include "ir/Index.h" -#include "ir/Shape.h" #include -#include -#include "util/Utils.h" #include #include @@ -79,31 +75,7 @@ protected: }; public: - virtual void run() override - { - // TODO Optimization : Make control does not reach here? 
when (_src_tensors.size() == 0) - assert(_src_tensors.size() == _dst_tensors.size()); - if (_src_tensors_offsets.size() == 0) - { - _src_tensors_offsets.resize(_src_tensors.size()); - _dst_tensors_offsets.resize(_dst_tensors.size()); - } - assert(_src_tensors.size() == _src_tensors_offsets.size()); - assert(_src_tensors_offsets.size() == _dst_tensors_offsets.size()); - - for (size_t i = 0; i < _src_tensors.size(); ++i) - { - auto src_tensor = _src_tensors.at(i); - auto dst_tensor = _dst_tensors.at(i); - auto &src_offsets = _src_tensors_offsets.at(i); - auto &dst_offsets = _dst_tensors_offsets.at(i); - if (src_tensor != dst_tensor) - { - const auto rank = src_tensor->getShape().rank(); - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - } - } - } + virtual void run() override; virtual void prepare() override { optimize(); } @@ -111,48 +83,7 @@ public: protected: void permute(backend::ITensor *src_tensor, backend::ITensor *dst_tensor, size_t rank, - std::vector &src_offsets, std::vector &dst_offsets) - { - if (src_tensor->total_size() == 0) - { - assert(dst_tensor->total_size() == 0); - return; - } - - assert(src_tensor != dst_tensor); - if (underlying_type(src_tensor->data_type()) != underlying_type(dst_tensor->data_type())) - throw std::runtime_error("data type does not match"); - switch (src_tensor->data_type()) - { - case ir::DataType::FLOAT32: - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - break; - case ir::DataType::INT32: - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - break; - case ir::DataType::UINT32: - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - break; - case ir::DataType::BOOL8: - case ir::DataType::QUANT_UINT8_ASYMM: - case ir::DataType::UINT8: - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - break; - case ir::DataType::QUANT_INT8_ASYMM: - case ir::DataType::QUANT_INT8_SYMM: - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - break; - case ir::DataType::INT64: - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - break; - case ir::DataType::QUANT_INT16_SYMM: - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - break; - default: - throw std::runtime_error("IPermuteFunction: Not supported data type"); - break; - } - } + std::vector &src_offsets, std::vector &dst_offsets); private: // TODO make src const by proving const access() @@ -322,31 +253,7 @@ protected: // NOTE The typeid expression is lvalue expression which refers to an object with static storage // duration, of the polymorphic type const std::type_info or of some type derived from it. 
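// [Editor's note] Illustrative sketch only, not part of this patch: underlying_type()
// collapses every ir::DataType onto the C++ storage type used to move its elements, so
// one permute<T> instantiation per storage width is enough and a source/destination
// mismatch can be detected with a single comparison (the .cc side then falls back to
// typeAwareQuantize() instead of a plain element copy). A minimal stand-alone analogue,
// with a hypothetical DT enum standing in for ir::DataType:
//
//   #include <cstdint>
//   #include <typeinfo>
//
//   enum class DT { FLOAT32, UINT8, QUANT_UINT8_ASYMM, BOOL8 };
//
//   const std::type_info &storage_type(DT t)
//   {
//     switch (t)
//     {
//       case DT::FLOAT32:
//         return typeid(float);
//       default:
//         return typeid(uint8_t); // BOOL8, UINT8 and asymmetric uint8 share one byte layout
//     }
//   }
//
//   // storage_type(DT::UINT8) == storage_type(DT::QUANT_UINT8_ASYMM): permuted as raw bytes.
//   // storage_type(DT::FLOAT32) != storage_type(DT::QUANT_UINT8_ASYMM): handled by the
//   // type-aware quantization path rather than permute<T>.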
// So std::type_info is non-copyable - const std::type_info &underlying_type(ir::DataType type) const - { - switch (type) - { - case ir::DataType::FLOAT32: - return typeid(float); - case ir::DataType::INT32: - return typeid(int32_t); - case ir::DataType::UINT32: - return typeid(uint32_t); - case ir::DataType::INT64: - return typeid(int64_t); - case ir::DataType::BOOL8: - case ir::DataType::QUANT_UINT8_ASYMM: - case ir::DataType::UINT8: - return typeid(uint8_t); - case ir::DataType::QUANT_INT8_ASYMM: - case ir::DataType::QUANT_INT8_SYMM: - return typeid(int8_t); - case ir::DataType::QUANT_INT16_SYMM: - return typeid(int16_t); - default: - throw std::runtime_error("IPermuteFunction: Not supported data type"); - } - } + const std::type_info &underlying_type(ir::DataType type) const; protected: std::vector _src_tensors; diff --git a/runtime/onert/core/src/exec/IPermuteFunction.test.cc b/runtime/onert/core/src/exec/IPermuteFunction.test.cc new file mode 100644 index 000000000..1009f194d --- /dev/null +++ b/runtime/onert/core/src/exec/IPermuteFunction.test.cc @@ -0,0 +1,902 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "IPermuteFunction.h" + +#include +#include +#include + +#include +#include + +namespace +{ +using namespace onert; +using namespace ir; +using namespace backend; +using namespace exec; + +class MockUpTensor : public ITensor +{ +public: + MockUpTensor(const Shape &shape, const TypeInfo &type_info, Layout layout, size_t pad) + : _shape(shape), _type_info(type_info), _data(nullptr), _layout(layout) + { + _strides.resize(shape.rank()); + + std::vector pads(shape.rank(), 0); + pads[shape.rank() - 1] = pad; + size_t stride = 1; + for (int32_t i = _shape.rank() - 1; i >= 0; --i) + { + _strides.at(i) = stride; + stride = stride * (_shape.dim(i) + pads.at(i)); + } + } + virtual ~MockUpTensor() {} + + void setBuffer(uint8_t *data) { _data = data; } + + size_t total_size() const override + { + size_t total_size = _strides[0] * _shape.dim(0); + total_size *= sizeOfDataType(data_type()); + return total_size; + } + + size_t calcOffset(const ir::Coordinates &coords) const override + { + size_t offset = 0; + for (size_t i = 0; i < _shape.rank(); ++i) + { + offset += (_strides[i] * coords[i]); + } + offset *= sizeOfDataType(data_type()); + return offset; + } + + uint8_t *buffer() const override { return _data; } + + ir::Layout layout() const override { return _layout; } + ir::DataType data_type() const override { return _type_info.type(); } + float data_scale() const override { return _type_info.scale(); } + int32_t data_zero_point() const override { return _type_info.zero_point(); } + const std::vector &data_scales() const override { return _type_info.scales(); } + const std::vector &data_zero_points() const override { return _type_info.zero_points(); } + bool has_padding() const override + { + return total_size() / sizeOfDataType(data_type()) != _shape.num_elements(); + } + void 
access(const std::function &fn) final { fn(*this); } + + bool is_dynamic() const override { return false; } + Shape getShape() const override { return _shape; } + +private: + Shape _shape; + TypeInfo _type_info; + Layout _layout; + uint8_t *_data; + std::vector _strides; +}; + +class MockUpLayer : public IPermuteFunction +{ +public: + MockUpLayer(const std::vector &inputs, const std::vector &outputs) + { + assert(inputs.size() == outputs.size()); + _src_tensors = inputs; + _dst_tensors = outputs; + } + virtual ~MockUpLayer() {} + void optimize() override {} +}; + +TEST(IPermuteFunction, float_rank1) +{ + const size_t input_pads[4] = {0, 1, 0, 2}; + const size_t output_pads[4] = {0, 0, 2, 1}; + const std::vector shapes{{1}, {4}, {5}, {2}}; + float expected_buffer[] = {1, 0, -1, -2, 3}; + const auto type_info = TypeInfo(DataType::FLOAT32); + + std::vector> inputs(4); + std::vector> outputs(4); + + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + inputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + outputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + Coordinates coords{j}; + float result = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + EXPECT_EQ(result, expected); + } + } +} + +TEST(IPermuteFunction, float_rank2) +{ + const size_t input_pads[4] = {0, 1, 0, 2}; + const size_t output_pads[4] = {0, 0, 2, 1}; + const std::vector shapes{{1, 4}, {2, 2}, {1, 5}, {2, 3}}; + float expected_buffer[] = {1, 0, -1, -2, 3, -4, 5, -6, 7, -8}; + const auto type_info = TypeInfo(DataType::FLOAT32); + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + inputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + outputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + Coordinates coords{j, k}; + float result = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + EXPECT_EQ(result, expected); + } + } + } +} + +TEST(IPermuteFunction, float_rank3) +{ + const size_t input_pads[4] = {0, 5, 0, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 4, 1}, {1, 2, 1}, {2, 1, 5}, {1, 2, 3}}; + float 
expected_buffer[] = {1, 0, -1, -2, 3, -4, 5, -6, 7, -8, 9, -10}; + const auto type_info = TypeInfo(DataType::FLOAT32); + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + inputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + outputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + Coordinates coords{j, k, l}; + float result = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + EXPECT_EQ(result, expected); + } + } + } + } +} + +TEST(IPermuteFunction, float_rank4) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {1, 0, -1, -2, 3, -4, 5, -6, 7, -8, 9, -10}; + const auto type_info = TypeInfo(DataType::FLOAT32); + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + inputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + outputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates coords{j, k, l, m}; + float result = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, float_rank4_layout) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {1, 0, -1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15, -16}; + const auto type_info = TypeInfo(DataType::FLOAT32); + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + Layout layout = Layout::NHWC; + Shape shape = shapes[i]; + if (i % 2 == 1) + { + layout = Layout::NCHW; + shape = Shape{shapes[i].dim(0), shapes[i].dim(3), shapes[i].dim(1), 
shapes[i].dim(2)}; + } + inputs[i] = std::make_unique(shape, type_info, layout, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + if (layout == Layout::NHWC) + { + layout = Layout::NCHW; + shape = Shape{shapes[i].dim(0), shapes[i].dim(3), shapes[i].dim(1), shapes[i].dim(2)}; + } + else + { + layout = Layout::NHWC; + shape = shapes[i]; + } + outputs[i] = std::make_unique(shape, type_info, layout, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates input_coords; + Coordinates output_coords; + if (inputs[i]->layout() == Layout::NHWC) + { + input_coords = Coordinates{j, k, l, m}; + } + else + { + input_coords = Coordinates{j, m, k, l}; + } + if (outputs[i]->layout() == Layout::NHWC) + { + output_coords = Coordinates{j, k, l, m}; + } + else + { + output_coords = Coordinates{j, m, k, l}; + } + float result = *reinterpret_cast(outputs[i]->buffer() + + outputs[i]->calcOffset(output_coords)); + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(input_coords)); + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, float_to_qasymm8) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, -10, -20, 30, -40, 50, -60, 70, -80, 90, -100}; + float scale = 10; + int32_t zero_point = 128; + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + inputs[i] = std::make_unique(shapes[i], TypeInfo(DataType::FLOAT32), Layout::NHWC, + input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + TypeInfo type_info{DataType::QUANT_UINT8_ASYMM, scale, zero_point}; + outputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates coords{j, k, l, m}; + uint8_t qasymm8 = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + float result = (qasymm8 - zero_point) * scale; + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, float_to_qsymm8) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector 
shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, -10, -20, 30, -40, 50, -60, 70, -80, 90, -100}; + float scale = 10; + int32_t zero_point = 0; + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + inputs[i] = std::make_unique(shapes[i], TypeInfo(DataType::FLOAT32), Layout::NHWC, + input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + TypeInfo type_info{DataType::QUANT_INT8_SYMM, scale, zero_point}; + outputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates coords{j, k, l, m}; + int8_t qsymm8 = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + float result = (qsymm8 - zero_point) * scale; + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, float_to_qsymm16) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, -10, -20, 30, -40, 50, -60, 70, -80, 90, -100}; + float scale = 10; + int32_t zero_point = 0; + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + inputs[i] = std::make_unique(shapes[i], TypeInfo(DataType::FLOAT32), Layout::NHWC, + input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + TypeInfo type_info{DataType::QUANT_INT16_SYMM, scale, zero_point}; + outputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates coords{j, k, l, m}; + int16_t qsymm16 = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + float result = (qsymm16 - zero_point) * scale; + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, qasymm8_to_float) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, 
-10, -20, 30, -40, 50, -60, 70, -80, 90, -100}; + float scale = 10; + int32_t zero_point = 128; + uint8_t input_buffer[12]; + + int32_t min_val = std::numeric_limits::min(); + int32_t max_val = std::numeric_limits::max(); + for (int32_t i = 0; i < sizeof(expected_buffer) / sizeof(float); ++i) + { + int32_t unclamped = static_cast(std::round(expected_buffer[i] / scale)) + zero_point; + input_buffer[i] = std::min(std::max(unclamped, min_val), max_val); + } + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + TypeInfo type_info{DataType::QUANT_UINT8_ASYMM, scale, zero_point}; + inputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(input_buffer)); + + outputs[i] = std::make_unique(shapes[i], TypeInfo(DataType::FLOAT32), + Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates coords{j, k, l, m}; + float result = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + uint8_t qasymm8 = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + float expected = (qasymm8 - zero_point) * scale; + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, qsymm8_to_float) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, -10, -20, 30, -40, 50, -60, 70, -80, 90, -100}; + float scale = 10; + int32_t zero_point = 0; + uint8_t input_buffer[12]; + + int32_t min_val = std::numeric_limits::min(); + int32_t max_val = std::numeric_limits::max(); + for (int32_t i = 0; i < sizeof(expected_buffer) / sizeof(float); ++i) + { + int32_t unclamped = static_cast(std::round(expected_buffer[i] / scale)) + zero_point; + input_buffer[i] = std::min(std::max(unclamped, min_val), max_val); + } + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + TypeInfo type_info{DataType::QUANT_INT8_SYMM, scale, zero_point}; + inputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(input_buffer)); + + outputs[i] = std::make_unique(shapes[i], TypeInfo(DataType::FLOAT32), + Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for 
(int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates coords{j, k, l, m}; + float result = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + int8_t qasymm8 = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + float expected = (qasymm8 - zero_point) * scale; + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, qsymm16_to_float) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, -10, -20, 30, -40, 50, -60, 70, -80, 90, -100}; + float scale = 10; + int32_t zero_point = 0; + uint8_t input_buffer[12]; + + int32_t min_val = std::numeric_limits::min(); + int32_t max_val = std::numeric_limits::max(); + for (int32_t i = 0; i < sizeof(expected_buffer) / sizeof(float); ++i) + { + int32_t unclamped = static_cast(std::round(expected_buffer[i] / scale)) + zero_point; + input_buffer[i] = std::min(std::max(unclamped, min_val), max_val); + } + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + TypeInfo type_info{DataType::QUANT_INT16_SYMM, scale, zero_point}; + inputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(input_buffer)); + + outputs[i] = std::make_unique(shapes[i], TypeInfo(DataType::FLOAT32), + Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates coords{j, k, l, m}; + float result = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + int16_t qasymm8 = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + float expected = (qasymm8 - zero_point) * scale; + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, float_to_qasymm8_layout) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, -10, -20, 30, -40, 50, -60, 70, + -80, 90, -100, 110, -120, 130, -140, 150, -160}; + float scale = 10; + int32_t zero_point = 128; + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + Layout layout = Layout::NHWC; + Shape shape = shapes[i]; + if (i % 2 == 1) + { + layout = Layout::NCHW; + shape = Shape{shapes[i].dim(0), shapes[i].dim(3), shapes[i].dim(1), shapes[i].dim(2)}; + } + inputs[i] = + std::make_unique(shape, TypeInfo(DataType::FLOAT32), layout, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + if (layout == Layout::NHWC) + { + layout = Layout::NCHW; + shape = Shape{shapes[i].dim(0), shapes[i].dim(3), shapes[i].dim(1), shapes[i].dim(2)}; + } + else + { + layout = Layout::NHWC; + shape = shapes[i]; + } + 
TypeInfo type_info{DataType::QUANT_UINT8_ASYMM, scale, zero_point}; + outputs[i] = std::make_unique(shape, type_info, layout, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates input_coords; + Coordinates output_coords; + if (inputs[i]->layout() == Layout::NHWC) + { + input_coords = Coordinates{j, k, l, m}; + } + else + { + input_coords = Coordinates{j, m, k, l}; + } + if (outputs[i]->layout() == Layout::NHWC) + { + output_coords = Coordinates{j, k, l, m}; + } + else + { + output_coords = Coordinates{j, m, k, l}; + } + uint8_t qasymm8 = *reinterpret_cast(outputs[i]->buffer() + + outputs[i]->calcOffset(output_coords)); + float result = (qasymm8 - zero_point) * scale; + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(input_coords)); + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, asymm8_to_float_layout) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, -10, -20, 30, -40, 50, -60, 70, + -80, 90, -100, 110, -120, 130, -140, 150, -160}; + float scale = 10; + int32_t zero_point = 128; + uint8_t input_buffer[18]; + + int32_t min_val = std::numeric_limits::min(); + int32_t max_val = std::numeric_limits::max(); + for (int32_t i = 0; i < sizeof(expected_buffer) / sizeof(float); ++i) + { + int32_t unclamped = static_cast(std::round(expected_buffer[i] / scale)) + zero_point; + input_buffer[i] = std::min(std::max(unclamped, min_val), max_val); + } + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + Layout layout = Layout::NHWC; + Shape shape = shapes[i]; + if (i % 2 == 1) + { + layout = Layout::NCHW; + shape = Shape{shapes[i].dim(0), shapes[i].dim(3), shapes[i].dim(1), shapes[i].dim(2)}; + } + TypeInfo type_info{DataType::QUANT_UINT8_ASYMM, scale, zero_point}; + inputs[i] = std::make_unique(shape, type_info, layout, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + if (layout == Layout::NHWC) + { + layout = Layout::NCHW; + shape = Shape{shapes[i].dim(0), shapes[i].dim(3), shapes[i].dim(1), shapes[i].dim(2)}; + } + else + { + layout = Layout::NHWC; + shape = shapes[i]; + } + outputs[i] = + std::make_unique(shape, TypeInfo(DataType::FLOAT32), layout, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); 
++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates input_coords; + Coordinates output_coords; + if (inputs[i]->layout() == Layout::NHWC) + { + input_coords = Coordinates{j, k, l, m}; + } + else + { + input_coords = Coordinates{j, m, k, l}; + } + if (outputs[i]->layout() == Layout::NHWC) + { + output_coords = Coordinates{j, k, l, m}; + } + else + { + output_coords = Coordinates{j, m, k, l}; + } + float result = *reinterpret_cast(outputs[i]->buffer() + + outputs[i]->calcOffset(output_coords)); + uint8_t qasymm8 = *reinterpret_cast(inputs[i]->buffer() + + inputs[i]->calcOffset(input_coords)); + float expected = (qasymm8 - zero_point) * scale; + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +} // namespace diff --git a/runtime/onert/core/src/exec/ParallelScheduler.cc b/runtime/onert/core/src/exec/ParallelScheduler.cc index 70c9c3dd6..456663f91 100644 --- a/runtime/onert/core/src/exec/ParallelScheduler.cc +++ b/runtime/onert/core/src/exec/ParallelScheduler.cc @@ -45,7 +45,7 @@ void ParallelScheduler::assign(std::unique_ptr &&fn, const backend::B void ParallelScheduler::finish() { - for (auto &itr : _thread_pools) + for (auto &&itr : _thread_pools) { itr.second->finish(); } diff --git a/runtime/onert/core/src/exec/SingleModelExecutors.cc b/runtime/onert/core/src/exec/SingleModelExecutors.cc new file mode 100644 index 000000000..4b954bab2 --- /dev/null +++ b/runtime/onert/core/src/exec/SingleModelExecutors.cc @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
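
Editor's note on the IPermuteFunction tests above: every float-to-quantized and quantized-to-float case uses the same affine convention, q = clamp(round(v / scale) + zero_point, qmin, qmax) on the way in and v = (q - zero_point) * scale on the way out, with the expected buffers chosen as exact multiples of scale so the tests can use EXPECT_EQ without a tolerance; the *_layout variants additionally remap NHWC coordinates {n, h, w, c} to NCHW {n, c, h, w}, which is why their NCHW branches index with Coordinates{j, m, k, l}. The stand-alone sketch below is not part of the patch (plain C++ with made-up helper names); it reproduces just the quantization round trip with the tests' scale of 10 and QUANT_UINT8_ASYMM zero point of 128.

// Editor's sketch: affine quantization round trip as exercised by the tests above.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>

template <typename Q> Q quantize(float v, float scale, int32_t zero_point)
{
  const int32_t min_val = std::numeric_limits<Q>::min();
  const int32_t max_val = std::numeric_limits<Q>::max();
  int32_t unclamped = static_cast<int32_t>(std::round(v / scale)) + zero_point;
  return static_cast<Q>(std::min(std::max(unclamped, min_val), max_val));
}

template <typename Q> float dequantize(Q q, float scale, int32_t zero_point)
{
  return (static_cast<int32_t>(q) - zero_point) * scale;
}

int main()
{
  const float scale = 10.0f;
  const int32_t zero_point = 128; // QUANT_UINT8_ASYMM in the tests; the symmetric types use 0
  const float values[] = {10, 0, -10, -20, 30, -40};
  for (float v : values)
  {
    uint8_t q = quantize<uint8_t>(v, scale, zero_point);
    assert(dequantize(q, scale, zero_point) == v); // exact because v is a multiple of scale
  }
  return 0;
}
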
+ */ + +#include "SingleModelExecutors.h" + +#include "../backend/builtin/IOTensor.h" + +namespace onert +{ +namespace exec +{ + +void SingleModelExecutors::emplace(const ir::ModelIndex &, const ir::SubgraphIndex &subg_index, + std::unique_ptr exec) +{ + _executors.emplace(subg_index, std::move(exec)); +} + +IExecutor *SingleModelExecutors::at(const ir::ModelIndex &, + const ir::SubgraphIndex &subg_index) const +{ + return _executors.at(subg_index).get(); +} + +uint32_t SingleModelExecutors::inputSize() const +{ + return entryExecutor()->getInputTensors().size(); +} + +uint32_t SingleModelExecutors::outputSize() const +{ + return entryExecutor()->getOutputTensors().size(); +} + +const ir::OperandInfo &SingleModelExecutors::inputInfo(const ir::IOIndex &index) const +{ + return entryExecutor()->getInputTensors().at(index.value())->orig_info(); +} + +const ir::OperandInfo &SingleModelExecutors::outputInfo(const ir::IOIndex &index) const +{ + return entryExecutor()->getOutputTensors().at(index.value())->orig_info(); +} + +void SingleModelExecutors::execute(const IODescription &desc) { entryExecutor()->execute(desc); } + +} // namespace exec +} // namespace onert diff --git a/runtime/onert/core/src/exec/SingleModelExecutors.h b/runtime/onert/core/src/exec/SingleModelExecutors.h new file mode 100644 index 000000000..98d629eae --- /dev/null +++ b/runtime/onert/core/src/exec/SingleModelExecutors.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __ONERT_EXEC_SINGLE_MODEL_EXECUTORS_H__
+#define __ONERT_EXEC_SINGLE_MODEL_EXECUTORS_H__
+
+#include "exec/IExecutors.h"
+#include "ir/NNPkg.h"
+
+namespace onert
+{
+namespace exec
+{
+
+/**
+ * @brief Class to gather executor set for single model NN package
+ */
+class SingleModelExecutors : public IExecutors
+{
+public:
+  /**
+   * @brief Construct a new SingleModelExecutors object
+   */
+  SingleModelExecutors(void) = default;
+  SingleModelExecutors(const SingleModelExecutors &) = delete;
+  SingleModelExecutors(SingleModelExecutors &&) = default;
+
+  /**
+   * @brief Destroy the SingleModelExecutors object
+   */
+  ~SingleModelExecutors() = default;
+
+public:
+  void emplace(const ir::ModelIndex &model_index, const ir::SubgraphIndex &subg_index,
+               std::unique_ptr<IExecutor> exec) override;
+
+  IExecutor *at(const ir::ModelIndex &model_index,
+                const ir::SubgraphIndex &subg_index) const override;
+
+  uint32_t inputSize() const override;
+
+  uint32_t outputSize() const override;
+
+  const ir::OperandInfo &inputInfo(const ir::IOIndex &index) const override;
+
+  const ir::OperandInfo &outputInfo(const ir::IOIndex &index) const override;
+
+  void execute(const IODescription &desc) override;
+
+private:
+  std::unordered_map<ir::SubgraphIndex, std::unique_ptr<IExecutor>> _executors;
+};
+
+} // namespace exec
+} // namespace onert
+
+#endif // __ONERT_EXEC_SINGLE_MODEL_EXECUTORS_H__
diff --git a/runtime/onert/core/src/exec/ThreadPool.cc b/runtime/onert/core/src/exec/ThreadPool.cc
index c8e0e3265..bf85e59f6 100644
--- a/runtime/onert/core/src/exec/ThreadPool.cc
+++ b/runtime/onert/core/src/exec/ThreadPool.cc
@@ -48,7 +48,7 @@ uint32_t ThreadPool::numJobsInQueue() { return _worker.numJobsInQueue(); }
 void ThreadPool::join()
 {
-  for (auto &thread : _threads)
+  for (auto &&thread : _threads)
   {
     thread.join();
   }
diff --git a/runtime/onert/core/src/interp/Buffer.h b/runtime/onert/core/src/interp/Buffer.h
deleted file mode 100644
index 24938f74f..000000000
--- a/runtime/onert/core/src/interp/Buffer.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
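
Editor's note: SingleModelExecutors above is a thin registry that ignores the ModelIndex, owns one IExecutor per SubgraphIndex, and forwards all I/O queries and execute() to the entry executor. The toy sketch below is not the onert API (simplified index and executor types); it only illustrates that ownership and lookup pattern in isolation.

// Editor's sketch: the ownership/lookup pattern used by SingleModelExecutors.
#include <cstdint>
#include <iostream>
#include <memory>
#include <unordered_map>

struct ToyExecutor
{
  virtual ~ToyExecutor() = default;
  virtual void execute() = 0;
};

struct AddExecutor : ToyExecutor
{
  void execute() override { std::cout << "running subgraph kernel\n"; }
};

class ToySingleModelExecutors
{
public:
  // The model index is ignored: a single-model package has exactly one model.
  void emplace(uint32_t /*model*/, uint32_t subg, std::unique_ptr<ToyExecutor> exec)
  {
    _executors.emplace(subg, std::move(exec));
  }
  ToyExecutor *at(uint32_t /*model*/, uint32_t subg) const { return _executors.at(subg).get(); }
  ToyExecutor *entryExecutor() const { return at(0, 0); } // subgraph 0 is the entry point
  void execute() { entryExecutor()->execute(); }

private:
  std::unordered_map<uint32_t, std::unique_ptr<ToyExecutor>> _executors;
};

int main()
{
  ToySingleModelExecutors execs;
  execs.emplace(0, 0, std::make_unique<AddExecutor>());
  execs.execute(); // delegates to the entry executor, as the patch's execute() does
  return 0;
}
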
- */ - -/** - * @file Buffer.h - * @brief This file contains Buffer interface and InternalBuffer, ExternalBuffer class - */ -#ifndef __ONERT_INTERP_BUFFER_H__ -#define __ONERT_INTERP_BUFFER_H__ - -#include - -#include "ir/Data.h" - -namespace onert -{ -namespace interp -{ - -/** - * @brief Interface for writable data area - */ -class Buffer : public ir::Data -{ -public: - /** - * @brief Return writable pointer for data area - * @return Writable pointer - */ - virtual uint8_t *baseWritable(void) const = 0; -}; - -/** - * @brief Class for internally allocated data area - */ -class InternalBuffer final : public Buffer -{ -public: - InternalBuffer(size_t size) : _base{std::make_unique(size)}, _size{size} - { - // DO NOTHING - } - -public: - size_t size(void) const override { return _size; } - const uint8_t *base(void) const override { return _base.get(); } - uint8_t *baseWritable(void) const override { return _base.get(); } - -private: - std::unique_ptr _base; - size_t _size; -}; - -/** - * @brief Class for data area from outside - */ -class ExternalBuffer final : public Buffer -{ -public: - ExternalBuffer(uint8_t *base, size_t size) : _base{base}, _size{size} - { - // DO NOTHING - } - -public: - size_t size(void) const override { return _size; } - const uint8_t *base(void) const override { return _base; } - uint8_t *baseWritable(void) const override { return _base; } - -private: - uint8_t *_base; - size_t _size; -}; - -} // namespace interp -} // namespace onert - -#endif // __ONERT_INTERP_BUFFER_H__ diff --git a/runtime/onert/core/src/interp/ExecEnv.h b/runtime/onert/core/src/interp/ExecEnv.h deleted file mode 100644 index 7f577ea6e..000000000 --- a/runtime/onert/core/src/interp/ExecEnv.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
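
Editor's note: the removed interp/Buffer.h separates storage the interpreter allocates itself (InternalBuffer, owned through a unique_ptr) from storage lent by the caller (ExternalBuffer, a non-owning view that must never be freed). A minimal stand-alone rendering of that split, under toy names rather than the onert types, is:

// Editor's sketch: owning vs. non-owning writable buffers, mirroring
// InternalBuffer/ExternalBuffer in the removed interp/Buffer.h.
#include <cstddef>
#include <cstdint>
#include <memory>

struct WritableBuffer
{
  virtual ~WritableBuffer() = default;
  virtual size_t size() const = 0;
  virtual uint8_t *base() const = 0;
};

// Allocates and owns its storage; freed automatically with the buffer object.
class OwningBuffer final : public WritableBuffer
{
public:
  explicit OwningBuffer(size_t size) : _base{std::make_unique<uint8_t[]>(size)}, _size{size} {}
  size_t size() const override { return _size; }
  uint8_t *base() const override { return _base.get(); }

private:
  std::unique_ptr<uint8_t[]> _base;
  size_t _size;
};

// Wraps memory provided by the caller (e.g. a model output buffer); never frees it.
class BorrowedBuffer final : public WritableBuffer
{
public:
  BorrowedBuffer(uint8_t *base, size_t size) : _base{base}, _size{size} {}
  size_t size() const override { return _size; }
  uint8_t *base() const override { return _base; }

private:
  uint8_t *_base;
  size_t _size;
};
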
- */ - -/** - * @file ExecEnv.h - * @brief This file contains ExecEnv to access interpreter tensor and execution status - */ -#ifndef __ONERT_INTERP_EXEC_ENV_H_ -#define __ONERT_INTERP_EXEC_ENV_H_ - -#include - -#include "ir/Graph.h" -#include "Tensor.h" - -namespace onert -{ -namespace interp -{ - -/** - * @brief Class to gather interpreter execution environment - * Each interpreter instance own execution environment - */ -class ExecEnv -{ -public: - /** - * @brief Construct a new Exec Env object (deleted) - */ - ExecEnv(void) = delete; - /** - * @brief Construct a new ExecEnv object - * @param[in] graph Graph to execute by interpreter - */ - explicit ExecEnv(const ir::Graph &graph) : _graph(graph) - { - // DO NOTHING - } - -public: - /** - * @brief Return graph to execute - * @return Graph - */ - const ir::Graph &graph(void) const { return _graph; } - /** - * @brief Assign tensor to environment which have allocated or assigned buffer - * @param[in] index Tensor index - * @param[in] tensor Tensor - */ - void assignTensor(const ir::OperandIndex index, std::shared_ptr tensor) - { - assert(tensor->bufferRO() != nullptr); - _tensors.emplace(index, tensor); - } - - /** - * @brief Return tensor pointer in environment - * @param[in] index Tensor index - * can_optional @c True if tensor can be optional input, otherwise @c false - * @return Tensor pointer - */ - const ITensor *tensorAt(const ir::OperandIndex index, bool can_optional = false) const - { - if (_tensors.find(index) == _tensors.end()) - { - // It may optional input, - // otherwise input is not set by runtime user - if (can_optional) - { - return nullptr; - } - - throw std::runtime_error{"ExecEnv: Input is not set"}; - } - - return _tensors.at(index).get(); - } - - /** - * @brief Check environment contains tensor - * @param[in] index Tensor index - * @return @c true if environment contain tensor, otherwise @c false - */ - bool contains(const ir::OperandIndex index) const - { - return (_tensors.find(index) != _tensors.end()); - } - - /** - * @brief Allocate tensor using operand info - * @param[in] index Tensor index - * @param[in] info Operand info - * @note If already allocated, just return - * @TODO More smart allocation policy - */ - void allocateIfNeeded(const ir::OperandIndex index, const ir::OperandInfo &info) - { - // already allocated, or constant - if (contains(index)) - { - return; - } - - // Buffer from external (ex. 
model output) - auto tensor = std::make_shared(info); - if (isExtBuffer(index)) - { - tensor->setBuffer(_external_buffers.at(index)); - assignTensor(index, tensor); - - return; - } - - tensor->setBuffer(std::make_shared(tensor->total_size())); - assignTensor(index, tensor); - _buffers.insert(index); - } - - /** - * @brief Allocate read-only tensor and share data with other tensor - * @param[in] index Tensor index - * @param[in] info Operand info - * @param[in] index_to_share Tensor index that have data to share - */ - void allocateAndShareIfNeeded(const ir::OperandIndex index, const ir::OperandInfo &info, - const ir::OperandIndex index_to_share) - { - if (!contains(index_to_share)) - { - throw std::runtime_error{"Cannot find tensor to share data"}; - } - - // already allocated - if (contains(index)) - { - return; - } - - if (isExtBuffer(index)) - { - auto tensor = std::make_shared(info); - tensor->setBuffer(_external_buffers.at(index)); - assignTensor(index, tensor); - } - else - { - auto tensor = std::make_shared(info); - tensor->setData(tensorAt(index_to_share)->shareData()); - assignTensor(index, tensor); - _buffers.insert(index); - } - } - - /** - * @brief Free buffer if allocated by allocateIfNeed - * @param[in] index Tensor index - * @note If allocated by outside, just return - */ - void freeIfAllocated(const ir::OperandIndex index) - { - if (_buffers.find(index) != _buffers.end()) - { - _tensors.at(index)->releaseData(); - } - } - - /** - * @brief Assign ExternalBuffer into external buffer map - * @param[in] index Tensor index - * @param[in] buffer External buffer - */ - void assignExternalBuffer(const ir::OperandIndex index, std::shared_ptr buffer) - { - _external_buffers.emplace(index, buffer); - } - -private: - bool isExtBuffer(const ir::OperandIndex index) - { - return (_external_buffers.find(index) != _external_buffers.end()); - } - -private: - const ir::Graph &_graph; - // Tensor map to use in interpreter - // It should map tensors that have allocated or assigned buffer pointer - std::unordered_map> _tensors; - // Tensors allocated by allocateIfNeed (buffer) - std::unordered_set _buffers; - // Tensor buffer from external - std::unordered_map> _external_buffers; -}; - -} // namespace interp -} // namespace onert - -#endif // __ONERT_INTERP_EXEC_ENV_H_ diff --git a/runtime/onert/core/src/interp/InterpExecutor.cc b/runtime/onert/core/src/interp/InterpExecutor.cc deleted file mode 100644 index f04777174..000000000 --- a/runtime/onert/core/src/interp/InterpExecutor.cc +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
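
Editor's note: ExecEnv::allocateIfNeeded and freeIfAllocated above implement an "only free what you allocated" policy; operands backed by externally supplied buffers are bound but never owned, while internally allocated operands are tracked so they can be released once their last use has executed. A compact sketch of that bookkeeping, using toy index and storage types instead of the onert classes, is:

// Editor's sketch: lazy allocation with external-buffer pass-through.
#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

using Index = uint32_t;

class ToyEnv
{
public:
  // Bind a caller-provided buffer; the environment must never free it.
  void bindExternal(Index idx, uint8_t *buf, size_t size) { _external[idx] = {buf, size}; }

  // Get (or lazily allocate) storage for an operand.
  uint8_t *allocateIfNeeded(Index idx, size_t size)
  {
    auto ext = _external.find(idx);
    if (ext != _external.end())
      return ext->second.first; // external buffer: reuse, do not own
    auto it = _owned.find(idx);
    if (it == _owned.end())
      it = _owned.emplace(idx, std::vector<uint8_t>(size)).first; // internal: allocate and own
    return it->second.data();
  }

  // Release storage only if this environment allocated it.
  void freeIfAllocated(Index idx) { _owned.erase(idx); }

private:
  std::unordered_map<Index, std::pair<uint8_t *, size_t>> _external;
  std::unordered_map<Index, std::vector<uint8_t>> _owned;
};
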
- */ - -#include "InterpExecutor.h" - -#include "ExecEnv.h" -#include "Interpreter.h" - -#include "util/logging.h" - -#include - -namespace onert -{ -namespace interp -{ - -void InterpExecutor::execute(const exec::IODescription &desc) -{ - /************************************************************************ - * Prepare execution model (submodel) - It may execute divided model - but now consider model inference is done at interpreter - ***********************************************************************/ - ir::OperandIndexMap> tensor_map; - - for (uint32_t n = 0; n < _graph.getInputs().size(); n++) - { - ir::IOIndex index{n}; - const auto input_index = _graph.getInputs().at(index); - - const auto input = desc.inputs.at(n).get(); - if (input == nullptr) - { - // Optional input - continue; - } - - auto input_tensor = std::make_shared(input->info); - input_tensor->setData(std::make_shared( - reinterpret_cast(input->buffer), input->size)); - tensor_map[input_index] = input_tensor; - } - - /************************************************************************ - * Prepare execution environment - Execution environment will be assigned to invoked interpreter instance - ***********************************************************************/ - - std::unique_ptr interp_env = std::make_unique(_graph); - - // Assign input/output tensor into interpreter execution environment - for (auto index : _graph.getInputs()) - { - if (tensor_map.find(index) != tensor_map.end()) - { - VERBOSE(INTERPRETER) << "Assign input tensor. operand index:" << index << std::endl; - interp_env->assignTensor(index, tensor_map.at(index)); - } - } - - for (uint32_t n = 0; n < _graph.getOutputs().size(); n++) - { - ir::IOIndex index{n}; - const auto output_index = _graph.getOutputs().at(index); - const auto output = desc.outputs.at(n).get(); - if (output == nullptr) - { - // Optional output - continue; - } - - VERBOSE(INTERPRETER) << "Set out buffer to ExecEnv. operand index:" << output_index.value() - << std::endl; - - interp_env->assignExternalBuffer( - output_index, - std::make_shared(reinterpret_cast(output->buffer), output->size)); - } - - // Allocate constant tensor - _graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) { - if (obj.isConstant()) - { - VERBOSE(INTERPRETER) << "Allocate and assign constant tensor. operand index:" << ind - << std::endl; - - assert(obj.data()); - auto const_tensor = std::make_shared(obj.info()); - // Assume that interpreter's tensor layout is same with model (NHWC) - const_tensor->setData( - std::make_shared(obj.data()->base(), obj.info().total_size())); - interp_env->assignTensor(ind, const_tensor); - } - }); - - /***************************************************************************** - * Invoke interpreter - ****************************************************************************/ - - interp::Interpreter interp(std::move(interp_env)); - interp.run(); - - /***************************************************************************** - * Invoked interpreter run is finished - ****************************************************************************/ - - // If interpreter execute submodel - // 1. Get tensor output of submodel into tensor_map to save result - // 2. 
Generate new ExecEnv for next interpretation -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/InterpExecutor.h b/runtime/onert/core/src/interp/InterpExecutor.h deleted file mode 100644 index d6d5dd0a3..000000000 --- a/runtime/onert/core/src/interp/InterpExecutor.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file InterpExecutor.h - * @brief This file contains InterpExecutor class\n - * to manage interpreter execution and environment - */ -#ifndef __ONERT_INTERP_INTERP_EXECUTOR_H__ -#define __ONERT_INTERP_INTERP_EXECUTOR_H__ - -#include "ir/OperandIndexMap.h" -#include "ir/Graph.h" -#include "exec/IExecutor.h" - -namespace onert -{ -namespace interp -{ - -class ITensor; - -/** - * @brief Class to execute model using interpreter - */ -class InterpExecutor final : public exec::IExecutor -{ -public: - explicit InterpExecutor(const ir::Graph &graph) : _graph(graph) - { - // DO NOTHING - } - -public: - /** - * @brief Return graph object - * @return Graph object - */ - const ir::Graph &graph() final { return _graph; } - - const ir::Graph &parent_graph() final - { - throw new std::runtime_error{"Interpreter does not support this function."}; - } - void setIndexedRanks(std::shared_ptr>) override{ - // Not implemented - }; - /** - * @brief Start execution - * @note It should be called after setting input and output buffer - */ - void execute(const exec::IODescription &desc) final; - void execute(const std::vector &, - const std::vector &) final - { - throw new std::runtime_error{"Interpreter does not support subgraph calls(control flow ops)"}; - } - const std::vector &getOutputTensors() const final - { - throw new std::runtime_error{"Interpreter does not support this function."}; - } - -private: - /** - * @brief Copy of target graph for lowering - * @note It uses copy of graph, not reference. - * Original graph may be deallocated by frontend. - */ - const ir::Graph _graph; - ir::OperandIndexMap> _tensor_map; -}; - -} // namespace interp -} // namespace onert - -#endif // __ONERT_INTERP_INTERP_EXECUTOR_H__ diff --git a/runtime/onert/core/src/interp/InterpExecutor.test.cc b/runtime/onert/core/src/interp/InterpExecutor.test.cc deleted file mode 100644 index 9f95ffee0..000000000 --- a/runtime/onert/core/src/interp/InterpExecutor.test.cc +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "InterpExecutor.h" - -#include "exec/Execution.h" -#include "ir/Graph.h" -#include "ir/operation/BinaryArithmetic.h" - -#include - -#include - -namespace -{ - -using namespace onert::ir; -using InterpExecutor = onert::interp::InterpExecutor; -using Execution = onert::exec::Execution; -using Executors = onert::exec::Executors; - -class InterpExecutorTest : public ::testing::Test -{ -protected: - virtual void SetUp() {} - void CreateSimpleModel() - { - // Model: one elementwise add operation - // model input: lhs, rhs - // model output: add result - // lhs, rhs, result shape: {1, 2, 2, 1} - // activation: none (constant) - _graph = std::make_unique(); - - // Add operands - - Shape shape{1, 2, 2, 1}; - TypeInfo type{DataType::INT32}; - Shape shape_scalar(0); - TypeInfo type_scalar{DataType::INT32}; - - auto operand_lhs = _graph->addOperand(shape, type); - auto operand_rhs = _graph->addOperand(shape, type); - auto operand_result = _graph->addOperand(shape, type); - - // Add operations - - operation::BinaryArithmetic::Param param; - param.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; - param.activation = Activation::NONE; - auto input_set = OperandIndexSequence{operand_lhs, operand_rhs}; - auto output_set = OperandIndexSequence{operand_result}; - _graph->addOperation( - std::make_unique(input_set, output_set, param)); - - // Identify model inputs and outputs - - _graph->getInputs().append(operand_lhs); - _graph->getInputs().append(operand_rhs); - _graph->getOutputs().append(operand_result); - - _graph->verify(); - - auto model = std::make_shared(); - model->push(onert::ir::SubgraphIndex{0}, _graph); - - _executors = std::make_shared(); - _executors->emplace(onert::ir::SubgraphIndex{0}, std::make_unique(*_graph)); - } - - void CreateTwoStepModel() - { - // Model: two elementwise add operation - // model input: lhs, rhs1 - // model output: second add result (result2) - // constant: rhs2 - // result1 <= (lhs + rhs) - // result2 <= (result1 + rhs2) - // lhs, rhs1, rh2, result1, result2 shape: {1, 2, 2, 1} - // activation: none (constant) - _graph = std::make_unique(); - - // 1st add operands (result1 <= lhs + rhs1) - - Shape shape{1, 2, 2, 1}; - TypeInfo type{DataType::INT32}; - Shape shape_scalar(0); - TypeInfo type_scalar{DataType::INT32}; - - static int32_t rhs2_data[4] = {3, 1, -1, 5}; - - auto operand_lhs = _graph->addOperand(shape, type); - auto operand_rhs1 = _graph->addOperand(shape, type); - auto operand_result1 = _graph->addOperand(shape, type); - auto operand_rhs2 = _graph->addOperand(shape, type); - auto operand_result2 = _graph->addOperand(shape, type); - _graph->operands() - .at(operand_rhs2) - .data(std::make_unique(reinterpret_cast(&rhs2_data), 16)); - - // 2nd add operations (result2 <= result1 + rhs2) - - operation::BinaryArithmetic::Param param1; - param1.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; - param1.activation = Activation::NONE; - auto input_set1 = OperandIndexSequence{operand_lhs, operand_rhs1}; - auto output_set1 = OperandIndexSequence{operand_result1}; - _graph->addOperation( - std::make_unique(input_set1, output_set1, param1)); - - operation::BinaryArithmetic::Param param2; - param2.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; - param2.activation = Activation::NONE; - auto input_set2 = OperandIndexSequence{operand_result1, operand_rhs2}; - auto output_set2 = 
OperandIndexSequence{operand_result2}; - _graph->addOperation( - std::make_unique(input_set2, output_set2, param2)); - - // Identify model inputs and outputs - - _graph->getInputs().append(operand_lhs); - _graph->getInputs().append(operand_rhs1); - _graph->getOutputs().append(operand_result2); - - _graph->verify(); - - auto model = std::make_shared(); - model->push(onert::ir::SubgraphIndex{0}, _graph); - - _executors = std::make_shared(); - _executors->emplace(onert::ir::SubgraphIndex{0}, std::make_unique(*_graph)); - } - - void CreateUnspecifiedDimensionsModel() - { - // Model: one elementwise add operation - // model input: lhs, rhs - // model output: add result - // lhs, rhs, result shape: {1, unknown, 2, 1} - // activation: none (constant) - _graph = std::make_unique(); - - // Add operands - - Shape shape{1, 0, 2, 1}; - TypeInfo type{DataType::INT32}; - Shape shape_scalar(0); - TypeInfo type_scalar{DataType::INT32}; - - auto operand_lhs = _graph->addOperand(shape, type); - auto operand_rhs = _graph->addOperand(shape, type); - - auto operand_activation = _graph->addOperand(shape_scalar, type_scalar); - _graph->operands() - .at(operand_activation) - .data(std::make_unique(reinterpret_cast(&_activation_value), 4)); - - auto operand_result = _graph->addOperand(shape, type); - - // Add operations - - operation::BinaryArithmetic::Param param; - param.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; - param.activation = Activation::NONE; - auto input_set = OperandIndexSequence{operand_lhs, operand_rhs}; - auto output_set = OperandIndexSequence{operand_result}; - _graph->addOperation( - std::make_unique(input_set, output_set, param)); - - // Identify model inputs and outputs - - _graph->getInputs().append(operand_lhs); - _graph->getInputs().append(operand_rhs); - _graph->getOutputs().append(operand_result); - - _graph->verify(); - - auto model = std::make_shared(); - model->push(onert::ir::SubgraphIndex{0}, _graph); - - _executors = std::make_shared(); - _executors->emplace(onert::ir::SubgraphIndex{0}, std::make_unique(*_graph)); - } - - void createExecution() { _execution = std::make_unique(_executors); } - - virtual void TearDown() { _executors = nullptr; } - - std::shared_ptr _graph{nullptr}; - std::shared_ptr _executors{nullptr}; - std::unique_ptr _execution{nullptr}; - const int32_t _activation_value{0}; -}; - -TEST_F(InterpExecutorTest, create_empty) -{ - Graph graph; - graph.verify(); - auto executor = std::make_unique(graph); - ASSERT_NE(executor, nullptr); -} - -TEST_F(InterpExecutorTest, create_simple) -{ - CreateSimpleModel(); - ASSERT_NE(_executors, nullptr); - ASSERT_NE(_executors->at(onert::ir::SubgraphIndex{0}), nullptr); -} - -TEST_F(InterpExecutorTest, neg_setInput) -{ - CreateSimpleModel(); - createExecution(); - - auto input1 = IOIndex{0}; - const int32_t input1_buffer[4] = {1, 0, -1, -2}; - - EXPECT_THROW(_execution->setInput(input1, reinterpret_cast(input1_buffer), 4), - std::runtime_error); - EXPECT_THROW(_execution->setInput(input1, reinterpret_cast(input1_buffer), 12), - std::runtime_error); - EXPECT_NO_THROW(_execution->setInput(input1, reinterpret_cast(input1_buffer), 16)); -} - -TEST_F(InterpExecutorTest, neg_setOutput) -{ - CreateSimpleModel(); - createExecution(); - - auto output = IOIndex{0}; - auto output_idx = _graph->getOutputs().at(output); - - int32_t output_buffer[4] = {}; - - EXPECT_THROW(_execution->setOutput(output, reinterpret_cast(output_buffer), 4), - std::runtime_error); - EXPECT_THROW(_execution->setOutput(output, 
reinterpret_cast(output_buffer), 12), - std::runtime_error); - EXPECT_NO_THROW(_execution->setOutput(output, reinterpret_cast(output_buffer), 16)); -} - -TEST_F(InterpExecutorTest, neg_setInputForUnspecifiedDimensions) -{ - CreateUnspecifiedDimensionsModel(); - createExecution(); - - auto input1 = IOIndex{0}; - const int32_t input1_buffer[4] = {1, 0, -1, -2}; - - TypeInfo operand_type{DataType::INT32}; - Shape operand_shape{1, 2, 2, 1}; - - EXPECT_THROW(_execution->setInput(input1, operand_type, operand_shape, - reinterpret_cast(input1_buffer), 4), - std::runtime_error); - EXPECT_THROW(_execution->setInput(input1, operand_type, operand_shape, - reinterpret_cast(input1_buffer), 12), - std::runtime_error); - EXPECT_NO_THROW(_execution->setInput(input1, operand_type, operand_shape, - reinterpret_cast(input1_buffer), 16)); -} - -TEST_F(InterpExecutorTest, neg_setOutputForUnspecifiedDimensions) -{ - CreateUnspecifiedDimensionsModel(); - createExecution(); - - auto output = IOIndex{0}; - auto output_idx = _graph->getOutputs().at(output); - - TypeInfo operand_type{DataType::INT32}; - Shape operand_shape{1, 2, 2, 1}; - - int32_t output_buffer[4] = {}; - - EXPECT_THROW(_execution->setOutput(output, operand_type, operand_shape, - reinterpret_cast(output_buffer), 4), - std::runtime_error); - EXPECT_THROW(_execution->setOutput(output, operand_type, operand_shape, - reinterpret_cast(output_buffer), 12), - std::runtime_error); - EXPECT_NO_THROW(_execution->setOutput(output, operand_type, operand_shape, - reinterpret_cast(output_buffer), 16)); -} - -TEST_F(InterpExecutorTest, execute) -{ - CreateSimpleModel(); - createExecution(); - - auto input1 = IOIndex{0}; - auto input2 = IOIndex{1}; - auto input1_idx = _graph->getInputs().at(input1); - auto input2_idx = _graph->getInputs().at(input2); - - const int32_t input1_buffer[4] = {1, 0, -1, -2}; - const int32_t input2_buffer[4] = {1, -3, 2, -4}; - - auto output = IOIndex{0}; - auto output_idx = _graph->getOutputs().at(output); - - int32_t output_buffer[4] = {}; - - EXPECT_NO_THROW(_execution->setInput(input1, reinterpret_cast(input1_buffer), 16)); - EXPECT_NO_THROW(_execution->setInput(input2, reinterpret_cast(input2_buffer), 16)); - EXPECT_NO_THROW(_execution->setOutput(output, reinterpret_cast(output_buffer), 16)); - EXPECT_NO_THROW(_execution->execute()); - EXPECT_EQ(output_buffer[0], 2); - EXPECT_EQ(output_buffer[1], -3); - EXPECT_EQ(output_buffer[2], 1); - EXPECT_EQ(output_buffer[3], -6); -} - -TEST_F(InterpExecutorTest, executeTwoStep) -{ - CreateTwoStepModel(); - createExecution(); - - auto input1 = IOIndex{0}; - auto input2 = IOIndex{1}; - auto input1_idx = _graph->getInputs().at(input1); - auto input2_idx = _graph->getInputs().at(input2); - - const int32_t input1_buffer[4] = {1, 0, -1, -2}; - const int32_t input2_buffer[4] = {1, -3, 2, -4}; - - auto output = IOIndex{0}; - auto output_idx = _graph->getOutputs().at(output); - - int32_t output_buffer[4] = {}; - - EXPECT_NO_THROW(_execution->setInput(input1, reinterpret_cast(input1_buffer), 16)); - EXPECT_NO_THROW(_execution->setInput(input2, reinterpret_cast(input2_buffer), 16)); - EXPECT_NO_THROW(_execution->setOutput(output, reinterpret_cast(output_buffer), 16)); - EXPECT_NO_THROW(_execution->execute()); - EXPECT_EQ(output_buffer[0], 5); - EXPECT_EQ(output_buffer[1], -2); - EXPECT_EQ(output_buffer[2], 0); - EXPECT_EQ(output_buffer[3], -1); -} - -} // namespace diff --git a/runtime/onert/core/src/interp/InterpOps.lst b/runtime/onert/core/src/interp/InterpOps.lst deleted file mode 100644 index 
0714df38a..000000000 --- a/runtime/onert/core/src/interp/InterpOps.lst +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef INTERP_OP -#error Define INTERP_OP before including this file -#endif - -// Supported operation name in interpreter -// -// Same list with Operations.lst -// Make comment out if operation is not supported in interpreter -INTERP_OP(BinaryArithmetic) -//INTERP_OP(BatchToSpaceND) -//INTERP_OP(Cast) -INTERP_OP(Conv2D) -INTERP_OP(DepthwiseConv2D) -INTERP_OP(Pool2D) -INTERP_OP(Concat) -INTERP_OP(FullyConnected) -//INTERP_OP(Reduce) -INTERP_OP(Reshape) -INTERP_OP(Softmax) -//INTERP_OP(Squeeze) -//INTERP_OP(Slice) -//INTERP_OP(StridedSlice) -INTERP_OP(ElementwiseActivation) -//INTERP_OP(Transpose) -//INTERP_OP(Exp) -//INTERP_OP(Comparison) -//INTERP_OP(LogicalNot) -//INTERP_OP(LSTM) -//INTERP_OP(RSQRT) -//INTERP_OP(ResizeBilinear) -//INTERP_OP(RNN) -//INTERP_OP(Floor) -//INTERP_OP(SpaceToBatchND) -//INTERP_OP(SpaceToDepth) -//INTERP_OP(EmbeddingLookup) -//INTERP_OP(L2Normalization) -//INTERP_OP(HashtableLookup) -INTERP_OP(InstanceNorm) -//INTERP_OP(PReLU) -INTERP_OP(TransposeConv) -//INTERP_OP(SQRT) -//INTERP_OP(SquaredDifference) -//INTERP_OP(TopKV2) -INTERP_OP(Gather) -//INTERP_OP(Neg) -//INTERP_OP(Abs) -//INTERP_OP(ArgMax) -//INTERP_OP(Dequantize) -//INTERP_OP(LocalResponseNormalization) -//INTERP_OP(DepthToSpace) -//INTERP_OP(Pack) -//INTERP_OP(Split) -//INTERP_OP(Unpack) -INTERP_OP(Pad) -//INTERP_OP(Custom) -//INTERP_OP(Permute) -//INTERP_OP(OneHot) diff --git a/runtime/onert/core/src/interp/Interpreter.cc b/runtime/onert/core/src/interp/Interpreter.cc deleted file mode 100644 index e01afb8a6..000000000 --- a/runtime/onert/core/src/interp/Interpreter.cc +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
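
Editor's note: InterpOps.lst above is consumed through the X-macro pattern; each includer defines INTERP_OP to expand every listed operation into whatever it needs (kernel-getter declarations in Registration.h, kernel-map population in the OperationExecutor constructor below). The self-contained sketch below shows the same mechanism with an inline list macro and invented op names, not the onert macros.

// Editor's sketch: the X-macro pattern behind InterpOps.lst, kept in one file
// by replacing the #include with a list macro.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#define TOY_OP_LIST(OP)                                                                            \
  OP(Add)                                                                                          \
  OP(Conv2D)                                                                                       \
  OP(Softmax)

// Expansion 1: declare and define one handler per op (Registration.h declares get##Name()).
#define DECLARE_OP(Name) void run##Name();
TOY_OP_LIST(DECLARE_OP)
#undef DECLARE_OP

#define DEFINE_OP(Name)                                                                            \
  void run##Name() { std::cout << "running " #Name "\n"; }
TOY_OP_LIST(DEFINE_OP)
#undef DEFINE_OP

// Expansion 2: build a dispatch table from the same list (Interpreter.cc fills _kernels this way).
int main()
{
  std::vector<std::pair<std::string, void (*)()>> kernels;
#define REGISTER_OP(Name) kernels.emplace_back(#Name, &run##Name);
  TOY_OP_LIST(REGISTER_OP)
#undef REGISTER_OP
  for (auto &entry : kernels)
    entry.second(); // dispatch every registered kernel once
  return 0;
}
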
- */ - -#include "Interpreter.h" - -#include -#include - -#include "Registration.h" - -#include "ir/OperandIndexMap.h" -#include "util/logging.h" -#include "ir/OperationVisitor.h" - -namespace onert -{ -namespace interp -{ - -// TODO more structured execution kernel implementation -// TODO use cker for execution -// TODO divide tensor prepare and execution -// TODO introduce memory manager (buffer allocate and free) -class OperationExecutor -{ -public: - OperationExecutor(ExecEnv *env) : _env{env} - { -#define INTERP_OP(InternalName) _kernels[ir::OpCode::InternalName] = get##InternalName(); -#include "InterpOps.lst" -#undef INTERP_OP - } - - void execute(const ir::OperationIndex &idx) - { - const ir::Operation &node = _env->graph().operations().at(idx); - const auto nodeName = node.name(); - VERBOSE(INTERPRETER) << "Prepare output operands and execute " << nodeName - << " operation (id: " << idx << ")" << std::endl; - - const auto nodeOpCode = node.opcode(); - if (_kernels.find(nodeOpCode) == _kernels.end()) - { - throw std::runtime_error{"Interpreter: Operation " + nodeName + " is not yet implemented"}; - } - - if (_kernels[nodeOpCode]->prepare != nullptr) - { - _kernels[nodeOpCode]->prepare(_env, node); - } - _kernels[nodeOpCode]->invoke(_env, node); - } - -private: - ExecEnv *_env; - std::unordered_map _kernels; -}; - -void Interpreter::run() -{ - VERBOSE(INTERPRETER) << "Interpreter is invoked " << std::endl; - - // operand_stack: save operands prepared to use - std::stack operand_stack; - - // Note: We should push input first, then constant. - // We use use-def for find operators ready to execution, - // but Use-Def cannot handle parameters (maybe constant, but not always) - // Note: If all model inputs are constant, it may not work (depend on tensors' order). 
- // But that scenario may not exist - for (auto ind : _env->graph().getInputs()) - { - VERBOSE(INTERPRETER) << "Input: Push to operand stack " << ind << std::endl; - - operand_stack.push(ind); - } - - _env->graph().operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) { - if (obj.isConstant()) - { - VERBOSE(INTERPRETER) << "Constant: Push to operand stack " << ind << std::endl; - - operand_stack.push(ind); - } - }); - - // Execution - std::unordered_set ready_check; - std::unordered_set executed; - OperationExecutor executor{_env.get()}; - while (!operand_stack.empty()) - { - const auto current_operand_index = operand_stack.top(); - operand_stack.pop(); - VERBOSE(INTERPRETER) << "Poped operand " << current_operand_index.value() - << " is checked ready to use" << std::endl; - - assert(ready_check.find(current_operand_index) == ready_check.end()); - ready_check.insert(current_operand_index); - - // Find prepared operations by scan use of current operand - std::stack operation_stack; - const auto use_operators = _env->graph().operands().at(current_operand_index).getUses(); - for (const auto &use_operator : use_operators) - { - // Assumption: all parameters are ready to use - bool operator_ready = true; - for (auto input_index : _env->graph().operations().at(use_operator).getInputs()) - { - if (ready_check.find(input_index) == ready_check.end()) - { - operator_ready = false; - break; - } - } - - if (operator_ready) - { - VERBOSE(INTERPRETER) << "Ready to execute operation " << use_operator << std::endl; - operation_stack.push(use_operator); - } - } - - while (!operation_stack.empty()) - { - const auto current_operation_index = operation_stack.top(); - operation_stack.pop(); - VERBOSE(INTERPRETER) << "Poped operation: " << current_operation_index << "(" - << _env->graph().operations().at(current_operation_index).name() << ")" - << std::endl; - - // execution - // 1. Prepare output tensor - // 2. Call operation kernel - executor.execute(current_operation_index); - executed.insert(current_operation_index); - - // 3. Push each output into operand stack - const auto def_operands = _env->graph().operations().at(current_operation_index).getOutputs(); - for (auto def_operand : def_operands) - { - VERBOSE(INTERPRETER) << "Buffer: Push to operand stack " << def_operand.value() - << std::endl; - operand_stack.push(def_operand); - } - - // 4. Free if lifetime of buffer operands used by input is finished - for (auto input_index : _env->graph().operations().at(current_operation_index).getInputs()) - { - const auto use_operators = _env->graph().operands().at(input_index).getUses(); - bool dead_buffer = true; - for (const auto &use_operator : use_operators) - { - if (executed.find(use_operator) == executed.end()) - { - dead_buffer = false; - break; - } - } - - if (dead_buffer) - { - _env->freeIfAllocated(input_index); - } - } - } - } -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/Interpreter.h b/runtime/onert/core/src/interp/Interpreter.h deleted file mode 100644 index d2165f538..000000000 --- a/runtime/onert/core/src/interp/Interpreter.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
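
Editor's note: Interpreter::run above schedules work purely from operand readiness; model inputs and constants seed an operand stack, an operation fires once every one of its inputs has been marked ready, and its outputs are pushed in turn (with dead buffers freed along the way). The stand-alone sketch below replays that loop over a two-operation toy graph, omitting the buffer-freeing step; it is an editor's illustration, not the onert data structures.

// Editor's sketch: readiness-driven scheduling as in the removed Interpreter::run.
#include <cstddef>
#include <iostream>
#include <stack>
#include <unordered_set>
#include <vector>

struct ToyOp
{
  const char *name;
  std::vector<int> inputs;  // operand ids consumed
  std::vector<int> outputs; // operand ids produced
};

int main()
{
  // Graph: op0: (0,1)->2, op1: (2,3)->4; operands 0,1 are model inputs, 3 is a constant.
  std::vector<ToyOp> ops = {{"add#0", {0, 1}, {2}}, {"add#1", {2, 3}, {4}}};
  std::stack<int> operand_stack;
  for (int seed : {0, 1, 3}) // push model inputs first, then constants
    operand_stack.push(seed);

  std::unordered_set<int> ready;
  std::unordered_set<size_t> executed;
  while (!operand_stack.empty())
  {
    int ind = operand_stack.top();
    operand_stack.pop();
    ready.insert(ind);

    // Fire every not-yet-executed op whose inputs are all ready, pushing its outputs.
    for (size_t i = 0; i < ops.size(); ++i)
    {
      if (executed.count(i))
        continue;
      bool all_ready = true;
      for (int in : ops[i].inputs)
        if (!ready.count(in))
          all_ready = false;
      if (!all_ready)
        continue;
      std::cout << "execute " << ops[i].name << "\n";
      executed.insert(i);
      for (int out : ops[i].outputs)
        operand_stack.push(out);
    }
  }
  return 0;
}
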
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file Interpreter.h - * @brief This file contains Interpreter class for interpretation - */ -#ifndef __ONERT_INTERP_INTERPRETER_H__ -#define __ONERT_INTERP_INTERPRETER_H__ - -#include "ExecEnv.h" - -namespace onert -{ -namespace interp -{ - -/** - * @brief Class for interpretation - */ -class Interpreter -{ - -public: - /** - * @brief Construct a new Interpreter object (deleted) - */ - Interpreter() = delete; - /** - * @brief Construct a new Interpreter object - * @param[in] env Execution environment variable for interpreter object - */ - Interpreter(std::unique_ptr env) : _env{std::move(env)} - { - // DO NOTHING - } - -public: - /** - * @brief Run interpreter until there is no operation to execute - */ - void run(); - -private: - std::unique_ptr _env; -}; - -} // namespace interp -} // namespace onert - -#endif // __ONERT_INTERP_INTERPRETER_H__ diff --git a/runtime/onert/core/src/interp/Registration.h b/runtime/onert/core/src/interp/Registration.h deleted file mode 100644 index 956b92a53..000000000 --- a/runtime/onert/core/src/interp/Registration.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_INTERP_REGISTRATION_H__ -#define __ONERT_INTERP_REGISTRATION_H__ - -#include "ExecEnv.h" - -#include "ir/Operation.h" - -namespace onert -{ -namespace interp -{ - -struct OpKernel -{ - std::function prepare; - std::function invoke; -}; - -// Defined in operations/ directory -#define INTERP_OP(InternalName) OpKernel *get##InternalName(); -#include "InterpOps.lst" -#undef INTERP_OP - -} // namespace interp -} // namespace onert - -#endif // __ONERT_INTERP_REGISTRATION_H__ diff --git a/runtime/onert/core/src/interp/Tensor.cc b/runtime/onert/core/src/interp/Tensor.cc deleted file mode 100644 index de095c9e4..000000000 --- a/runtime/onert/core/src/interp/Tensor.cc +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "Tensor.h" - -#define NO_USE(a) (void)(a) - -namespace onert -{ -namespace interp -{ - -void ITensor::access(const std::function &fn) { fn(*this); } - -size_t ROTensor::calcOffset(const ir::Coordinates &coords) const -{ - NO_USE(coords); - throw std::runtime_error("offset_element_in_bytes is not supported for cpu::Tensor now."); -} - -size_t Tensor::calcOffset(const ir::Coordinates &coords) const -{ - NO_USE(coords); - throw std::runtime_error("offset_element_in_bytes is not supported for cpu::Tensor now."); -} - -ir::Layout ROTensor::layout() const -{ - // TODO Changes to return frontend layout - return ir::Layout::NHWC; -} - -ir::Layout Tensor::layout() const -{ - // TODO Changes to return frontend layout - return ir::Layout::NHWC; -} - -ir::Shape Tensor::getShape() const { return _info.shape(); } - -ir::Shape ROTensor::getShape() const { return _info.shape(); } - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/Tensor.h b/runtime/onert/core/src/interp/Tensor.h deleted file mode 100644 index 642fdc164..000000000 --- a/runtime/onert/core/src/interp/Tensor.h +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file Tensor.h - * @brief This file contains ITensor interface, ROTensor class, and Tensor class - */ -#ifndef __ONERT_INTERP_TENSOR_H__ -#define __ONERT_INTERP_TENSOR_H__ - -#include "Buffer.h" - -#include "ir/OperandInfo.h" -#include "backend/ITensor.h" -#include "ir/Layout.h" - -namespace onert -{ -namespace interp -{ - -/** - * @brief Interface to handle Tensor in interpreter - */ -class ITensor : public backend::ITensor -{ -public: - virtual ~ITensor() = default; - -public: - virtual uint8_t *buffer() const = 0; - /** - * @brief Return shared pointer for buffer - * @return Buffer shared pointer - */ - virtual std::shared_ptr shareBuffer() const = 0; - /** - * @brief Return read-only buffer pointer - * @return Read-only buffer pointer - */ - virtual const uint8_t *bufferRO() const = 0; - /** - * @brief Return shared pointer for data - * @return Data shared pointer - */ - virtual std::shared_ptr shareData() const = 0; - /** - * @brief Set internal/external buffer - * @param[in] buffer Buffer pointer - */ - virtual void setBuffer(std::shared_ptr buffer) = 0; - /** - * @brief Set data reference (including constant, input) - * @param[in] data Data pointer - */ - virtual void setData(std::shared_ptr data) = 0; - virtual void releaseData() = 0; - - virtual size_t total_size() const = 0; - virtual size_t calcOffset(const ir::Coordinates &coords) const = 0; - - virtual bool has_padding() const = 0; - /** - * @brief Return data type of tensor - * @return Data type of tensor - */ - virtual ir::DataType data_type() const = 0; - /** - * @brief Return TensorInfo - * @return TensorInfo - */ - virtual const ir::OperandInfo &tensorInfo() const = 0; - /** - * @brief Return number of elements - * @return Number of elements - */ - virtual uint64_t num_elements() const = 0; - void access(const std::function &fn) final; -}; - -/** - * @brief Class to handle tensor in interpreter as read-only - */ -class ROTensor final : public ITensor -{ -public: - ROTensor() = delete; - ROTensor(const ir::OperandInfo &info) : _info(info) - { - // DO NOTHING - } - -public: - uint8_t *buffer() const override { throw std::runtime_error{"Read only tensor"}; } - std::shared_ptr shareBuffer() const override - { - throw std::runtime_error{"Read only tensor"}; - } - const uint8_t *bufferRO() const override { return _data->base(); } - std::shared_ptr shareData() const override { return _data; } - void setBuffer(std::shared_ptr buffer) override { _data = buffer; } - void setData(std::shared_ptr data) override { _data = data; } - void releaseData() override { _data = nullptr; } - - size_t total_size() const override { return _info.total_size(); } - size_t calcOffset(const ir::Coordinates &coords) const override; - ir::Layout layout() const override; - bool is_dynamic() const override { return false; } - bool has_padding() const override { return false; } - ir::DataType data_type() const override { return _info.typeInfo().type(); } - float data_scale() const override { return _info.typeInfo().scale(); } - int32_t data_zero_point() const override { return _info.typeInfo().zero_point(); } - const std::vector &data_scales() const override { return _info.typeInfo().scales(); } - const std::vector &data_zero_points() const override - { - return _info.typeInfo().zero_points(); - } - const ir::OperandInfo &tensorInfo() const override { return _info; } - uint64_t num_elements() const override { return _info.shape().num_elements(); }; - ir::Shape getShape() const override; - -private: - const ir::OperandInfo _info; - 
std::shared_ptr _data{nullptr}; -}; - -/** - * @brief Class to handle tensor in interpreter as writable - */ -class Tensor final : public ITensor -{ -public: - Tensor() = delete; - Tensor(const ir::OperandInfo &info) : _info(info) - { - // DO NOTHING - } - -public: - uint8_t *buffer() const override { return _buffer->baseWritable(); } - std::shared_ptr shareBuffer() const override { return _buffer; }; - const uint8_t *bufferRO() const override { return _buffer->base(); } - std::shared_ptr shareData() const override { return _buffer; } - void setBuffer(std::shared_ptr buffer) override { _buffer = buffer; } - void setData(std::shared_ptr) override - { - throw std::runtime_error{"Passed data may read-only"}; - } - void releaseData() override { _buffer = nullptr; } - - size_t total_size() const override { return _info.total_size(); } - size_t calcOffset(const ir::Coordinates &coords) const override; - ir::Layout layout() const override; - bool is_dynamic() const override { return false; } - bool has_padding() const override { return false; } - ir::DataType data_type() const override { return _info.typeInfo().type(); } - float data_scale() const override { return _info.typeInfo().scale(); } - int32_t data_zero_point() const override { return _info.typeInfo().zero_point(); } - const std::vector &data_scales() const override { return _info.typeInfo().scales(); } - const std::vector &data_zero_points() const override - { - return _info.typeInfo().zero_points(); - } - const ir::OperandInfo &tensorInfo() const override { return _info; } - uint64_t num_elements() const override { return _info.shape().num_elements(); }; - ir::Shape getShape() const override; - -private: - const ir::OperandInfo _info; - std::shared_ptr _buffer{nullptr}; -}; - -} // namespace interp -} // namespace onert - -#endif // __ONERT_INTERP_TENSOR_H__ diff --git a/runtime/onert/core/src/interp/operations/BinaryArithmeticOps.cc b/runtime/onert/core/src/interp/operations/BinaryArithmeticOps.cc deleted file mode 100644 index fe4acd309..000000000 --- a/runtime/onert/core/src/interp/operations/BinaryArithmeticOps.cc +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/BinaryArithmetic.h" - -#include -#include -#include - -namespace onert -{ -namespace interp -{ -namespace -{ - -enum class OpType -{ - ADD, - SUB, - MUL -}; - -void prepare(ExecEnv *env, const ir::Operation &node) -{ - const auto &arithmetic_node = - nnfw::misc::polymorphic_downcast(node); - - const auto lhs_index = node.getInputs().at(arithmetic_node.LHS); - const auto rhs_index = node.getInputs().at(arithmetic_node.RHS); - const auto out_index = node.getOutputs().at(0); - - const auto lhs_tensor = env->tensorAt(lhs_index); - const auto rhs_tensor = env->tensorAt(rhs_index); - - // Check shape and type lhs is same with rhs - // TODO Util function to compare TensorInfo - if (lhs_tensor->data_type() != rhs_tensor->data_type()) - { - throw std::runtime_error{"Interp(" + arithmetic_node.name() + "): Different input types"}; - } - - bool try_broadcast = (lhs_tensor->tensorInfo().shape() != rhs_tensor->tensorInfo().shape()); - if (try_broadcast) - { - bool success = true; - auto out_shape = calcBroadcastShape(lhs_tensor->tensorInfo().shape(), - rhs_tensor->tensorInfo().shape(), success); - if (!success) - { - throw std::runtime_error{"Interp(" + arithmetic_node.name() + "): Fail to brodcasting"}; - } - - auto output_info = - ir::OperandInfo::createStaticInfo(out_shape, lhs_tensor->tensorInfo().typeInfo()); - // We can handle already allocated (ex. model output) - env->allocateIfNeeded(out_index, output_info); - } - else - { - // Output's shape and type is same with input - auto output_info = lhs_tensor->tensorInfo(); - // We can handle already allocated (ex. model output) - env->allocateIfNeeded(out_index, output_info); - } - - auto out_tensor = env->tensorAt(out_index); - // Check shape and type lhs is same with output - // TODO Util function to compare TensorInfo - if (lhs_tensor->data_type() != out_tensor->data_type()) - { - throw std::runtime_error{"Interp(" + arithmetic_node.name() + "): Invalid output type"}; - } -} - -inline void setActivationParams(float min, float max, nnfw::cker::BinaryArithmeticOpParam *params) -{ - params->float_activation_min = min; - params->float_activation_max = max; -} - -inline void setActivationParams(int32_t min, int32_t max, - nnfw::cker::BinaryArithmeticOpParam *params) -{ - params->quantized_activation_min = min; - params->quantized_activation_max = max; -} - -template -void invoke(const ITensor *lhs_tensor, const ITensor *rhs_tensor, const ITensor *out_tensor, - const ir::operation::BinaryArithmetic::Param ¶m) -{ - const auto lhs_buffer = lhs_tensor->bufferRO(); - const auto rhs_buffer = rhs_tensor->bufferRO(); - auto out_buffer = out_tensor->buffer(); - - nnfw::cker::BinaryArithmeticOpParam cker_param; - raw_type activation_min, activation_max; - calculateActivationRange(param.activation, &activation_min, &activation_max); - setActivationParams(activation_min, activation_max, &cker_param); - const raw_type *lhs_ptr = reinterpret_cast(lhs_buffer); - const raw_type *rhs_ptr = reinterpret_cast(rhs_buffer); - raw_type *out_ptr = reinterpret_cast(out_buffer); - - const auto cker_op_type = - (op_type == OpType::ADD) ? nnfw::cker::BinaryArithmeticOpType::ADD - : ((op_type == OpType::SUB) ? 
nnfw::cker::BinaryArithmeticOpType::SUB - : nnfw::cker::BinaryArithmeticOpType::MUL); - - const bool need_broadcast = - nnfw::cker::ProcessBroadcastShapes(convertShape(lhs_tensor->tensorInfo().shape()), - convertShape(rhs_tensor->tensorInfo().shape()), &cker_param); - - if (need_broadcast) - { - const auto lhs_shape = convertShape(lhs_tensor->tensorInfo().shape()); - const auto rhs_shape = convertShape(rhs_tensor->tensorInfo().shape()); - const auto out_shape = convertShape(out_tensor->tensorInfo().shape()); - nnfw::cker::BroadcastBinaryArithmeticOp(cker_param, lhs_shape, lhs_ptr, rhs_shape, - rhs_ptr, out_shape, out_ptr); - return; - } - - const auto lhs_shape = convertShape(lhs_tensor->tensorInfo().shape()); - const auto rhs_shape = convertShape(rhs_tensor->tensorInfo().shape()); - const auto out_shape = convertShape(out_tensor->tensorInfo().shape()); - nnfw::cker::BinaryArithmeticOp(cker_param, lhs_shape, lhs_ptr, rhs_shape, rhs_ptr, - out_shape, out_ptr); -} - -template -void invokeBinaryArithmetic(const ExecEnv *env, const ir::operation::BinaryArithmetic &node) -{ - const auto lhs_index = node.getInputs().at(node.LHS); - const auto rhs_index = node.getInputs().at(node.RHS); - const auto out_index = node.getOutputs().at(0); - const auto lhs_tensor = env->tensorAt(lhs_index); - const auto rhs_tensor = env->tensorAt(rhs_index); - const auto out_tensor = env->tensorAt(out_index); - const auto data_type = lhs_tensor->data_type(); - - if (data_type == ir::DataType::INT32) - { - invoke(lhs_tensor, rhs_tensor, out_tensor, node.param()); - } - else if (data_type == ir::DataType::FLOAT32) - { - invoke(lhs_tensor, rhs_tensor, out_tensor, node.param()); - } - else - { - throw std::runtime_error{"NYI: Unsupported data type"}; - } -} - -void invokeBinaryArithmeticOps(const ExecEnv *env, const ir::Operation &node) -{ - const auto &arithmetic_node = - nnfw::misc::polymorphic_downcast(node); - - switch (arithmetic_node.param().arithmetic_type) - { - case ir::operation::BinaryArithmetic::ArithmeticType::ADD: - invokeBinaryArithmetic(env, arithmetic_node); - break; - case ir::operation::BinaryArithmetic::ArithmeticType::SUB: - invokeBinaryArithmetic(env, arithmetic_node); - break; - case ir::operation::BinaryArithmetic::ArithmeticType::MUL: - invokeBinaryArithmetic(env, arithmetic_node); - break; - default: - throw std::runtime_error{"Interp(BinaryArithmetic): NYI unsupported operation " + - arithmetic_node.name()}; - break; - } -} - -} // namespace - -OpKernel *getBinaryArithmetic() -{ - static OpKernel kernel = {prepare, invokeBinaryArithmeticOps}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/Concat.cc b/runtime/onert/core/src/interp/operations/Concat.cc deleted file mode 100644 index 103604631..000000000 --- a/runtime/onert/core/src/interp/operations/Concat.cc +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/Concat.h" - -#include -#include - -namespace onert -{ -namespace interp -{ -namespace concat -{ - -void prepareConcat(ExecEnv *env, const ir::Operation &node) -{ - const auto &concat_node = nnfw::misc::polymorphic_downcast(node); - - const auto first_index = node.getInputs().at(0); - const auto out_index = node.getOutputs().at(0); - - const auto first_tensor = env->tensorAt(first_index); - uint32_t out_axis_dimension = 0; - const int32_t axis_raw = concat_node.param().axis; - const int32_t axis = (axis_raw < 0) ? (axis_raw + first_tensor->getShape().rank()) : axis_raw; - - // All inputs shape should be same except axis dimension - // All inputs type should be same - for (auto input : node.getInputs()) - { - assert(first_tensor->getShape().rank() == env->tensorAt(input)->getShape().rank()); - assert(first_tensor->data_type() == env->tensorAt(input)->data_type()); - for (int i = 0; i < first_tensor->getShape().rank(); i++) - { - if (i == axis) - { - out_axis_dimension += env->tensorAt(input)->getShape().dim(i); - continue; - } - assert(first_tensor->getShape().dim(i) == env->tensorAt(input)->getShape().dim(i)); - } - } - - // Make output tensor info using first input tensor info, and accumulated axis dimension value - auto out_shape = first_tensor->tensorInfo().shape(); - out_shape.dim(axis) = out_axis_dimension; - env->allocateIfNeeded( - out_index, ir::OperandInfo::createStaticInfo(out_shape, first_tensor->tensorInfo().typeInfo())); - - auto out_tensor = env->tensorAt(out_index); - UNUSED_RELEASE(out_tensor); - - // Output shape should be same with input except axis getShape().dim - // Output type should be same with input - assert(first_tensor->data_type() == out_tensor->data_type()); - for (int i = 0; i < first_tensor->getShape().rank(); i++) - { - if (i == axis) - { - continue; - } - assert(first_tensor->getShape().dim(i) == out_tensor->getShape().dim(i)); - } -} - -void invoke(const std::vector in_tensors, const ITensor *out_tensor, uint32_t axis) -{ - const uint32_t count = in_tensors.size(); - - // Calculate - nnfw::cker::ConcatenationParams cker_param; - cker_param.axis = (int8_t)axis; - cker_param.inputs_count = count; - - const auto out_shape = convertShape(out_tensor->tensorInfo().shape()); - - std::vector in_shapes; - std::vector in_shape_ptrs; - in_shapes.reserve(count); - in_shape_ptrs.reserve(count); - std::vector in_ptrs; - for (uint32_t i = 0; i < count; i++) - { - in_shapes.push_back(convertShape(in_tensors[i]->tensorInfo().shape())); - in_shape_ptrs.push_back(&in_shapes[i]); - in_ptrs.push_back(reinterpret_cast(in_tensors[i]->bufferRO())); - } - - auto out_buffer = out_tensor->buffer(); - float *out_ptr = reinterpret_cast(out_buffer); - - nnfw::cker::Concatenation(cker_param, in_shape_ptrs.data(), in_ptrs.data(), out_shape, - out_ptr); -} - -void invokeConcat(const ExecEnv *env, const ir::Operation &node) -{ - const auto &concat_node = nnfw::misc::polymorphic_downcast(node); - const int32_t axis_raw = concat_node.param().axis; - - std::vector in_tensors; - for (const auto &e : concat_node.getInputs()) - { - in_tensors.emplace_back(env->tensorAt(e)); - } - - const auto out_index = node.getOutputs().at(0); - const auto out_tensor = env->tensorAt(out_index); - const uint32_t axis = (axis_raw < 0) ? 
(axis_raw + out_tensor->getShape().rank()) : axis_raw; - - const auto data_type = in_tensors[0]->data_type(); - if (data_type == ir::DataType::FLOAT32) - { - invoke(in_tensors, out_tensor, axis); - } - else - { - throw std::runtime_error{"NYI: Support float32 only"}; - } -} -} // namespace concat - -OpKernel *getConcat() -{ - static OpKernel kernel = {concat::prepareConcat, concat::invokeConcat}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/Conv2D.cc b/runtime/onert/core/src/interp/operations/Conv2D.cc deleted file mode 100644 index 72c2057c2..000000000 --- a/runtime/onert/core/src/interp/operations/Conv2D.cc +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/Conv2D.h" -#include "util/ShapeInference.h" -#include "util/Utils.h" - -#include -#include - -namespace onert -{ -namespace interp -{ -namespace conv2d -{ - -void prepareConv2D(ExecEnv *env, const ir::Operation &node) -{ - const auto in_index = node.getInputs().at(ir::operation::Conv2D::INPUT); - const auto kernel_index = node.getInputs().at(ir::operation::Conv2D::KERNEL); - const auto bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS); - const auto out_index = node.getOutputs().at(0); - - const auto in_tensor = env->tensorAt(in_index); - const auto kernel_tensor = env->tensorAt(kernel_index); - const auto bias_tensor = env->tensorAt(bias_index); - - assert(in_tensor->getShape().rank() == 4); - assert(kernel_tensor->getShape().rank() == 4); - assert(bias_tensor->getShape().rank() == 1); - - UNUSED_RELEASE(in_tensor); - UNUSED_RELEASE(kernel_tensor); - UNUSED_RELEASE(bias_tensor); - - const auto output_info = env->graph().operands().at(out_index).info(); - if (output_info.total_size() == 0) - { - // Handle unspecified output shape - const auto &conv_node = nnfw::misc::polymorphic_downcast(node); - const auto infered_output_shape = shape_inference::inferConv2DShape( - in_tensor->tensorInfo().shape(), kernel_tensor->tensorInfo().shape(), conv_node.param()); - env->allocateIfNeeded( - out_index, ir::OperandInfo::createStaticInfo(infered_output_shape, output_info.typeInfo())); - } - else - { - env->allocateIfNeeded(out_index, output_info); - } - - auto out_tensor = env->tensorAt(out_index); - UNUSED_RELEASE(out_tensor); - - // Handle same ifm & ofm data type only - assert(in_tensor->data_type() == out_tensor->data_type()); - assert(out_tensor->getShape().rank() == 4); -} - -void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor *bias_tensor, - const ITensor *ofm_tensor, const ir::operation::Conv2D::Param ¶m) -{ - // TODO Support NCHW frontned - const auto ifm_shape = ifm_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - const auto ofm_shape = ofm_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - // Kernel 
format is [depth_out, kernel_height, kernel_width, depth_in]. - const auto &ker_shape = ker_tensor->tensorInfo().shape(); - const auto ker_height = ker_shape.dim(1); - const auto ker_width = ker_shape.dim(2); - const auto padding = - ir::calculatePadding(param.padding, ifm_shape, ofm_shape, param.stride, ker_width, ker_height); - - // Calculate - float activation_min, activation_max; - calculateActivationRange(param.activation, &activation_min, &activation_max); - - nnfw::cker::ConvParams cker_param; - cker_param.padding_type = convertPaddingType(param.padding.type); - cker_param.padding_values.width = padding.left; - cker_param.padding_values.height = padding.top; - cker_param.stride_width = param.stride.horizontal; - cker_param.stride_height = param.stride.vertical; - cker_param.dilation_width_factor = 1; - cker_param.dilation_height_factor = 1; - cker_param.float_activation_min = activation_min; - cker_param.float_activation_max = activation_max; - - const auto cker_ifm_shape = convertShape(ifm_tensor->tensorInfo().shape()); - const auto cker_ker_shape = convertShape(ker_tensor->tensorInfo().shape()); - const auto cker_bias_shape = convertShape(bias_tensor->tensorInfo().shape()); - const auto cker_ofm_shape = convertShape(ofm_tensor->tensorInfo().shape()); - const float *ifm_ptr = reinterpret_cast(ifm_tensor->bufferRO()); - const float *ker_ptr = reinterpret_cast(ker_tensor->bufferRO()); - const float *bias_ptr = reinterpret_cast(bias_tensor->bufferRO()); - float *ofm_ptr = reinterpret_cast(ofm_tensor->buffer()); - - nnfw::cker::Conv conv_kernel; - conv_kernel(cker_param, cker_ifm_shape, ifm_ptr, cker_ker_shape, ker_ptr, cker_bias_shape, - bias_ptr, cker_ofm_shape, ofm_ptr); -} - -void invokeConv2D(const ExecEnv *env, const ir::Operation &node) -{ - const auto &conv_node = nnfw::misc::polymorphic_downcast(node); - - const auto ifm_index = node.getInputs().at(ir::operation::Conv2D::INPUT); - const auto ker_index = node.getInputs().at(ir::operation::Conv2D::KERNEL); - const auto bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS); - const auto ofm_index = node.getOutputs().at(0); - - const auto ifm_tensor = env->tensorAt(ifm_index); - const auto ker_tensor = env->tensorAt(ker_index); - const auto bias_tensor = env->tensorAt(bias_index); - const auto ofm_tensor = env->tensorAt(ofm_index); - - const auto data_type = ifm_tensor->data_type(); - if (data_type == ir::DataType::FLOAT32) - { - invoke(ifm_tensor, ker_tensor, bias_tensor, ofm_tensor, conv_node.param()); - } - else - { - throw std::runtime_error{"NYI: Support float32 only"}; - } -} -} // namespace conv2d - -OpKernel *getConv2D() -{ - static OpKernel kernel = {conv2d::prepareConv2D, conv2d::invokeConv2D}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc b/runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc deleted file mode 100644 index 9f527440e..000000000 --- a/runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
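The Conv2D kernel above obtains its padding from ir::calculatePadding(). As an illustration only, assuming the common TensorFlow-style convention rather than quoting the onert helper, SAME and VALID padding for one spatial dimension can be derived like this:

#include <algorithm>
#include <cstdio>

struct Pad1D
{
  int output_size;
  int before; // top or left
  int after;  // bottom or right
};

Pad1D computeSamePadding(int input, int kernel, int stride)
{
  const int output = (input + stride - 1) / stride; // ceil(input / stride)
  const int total = std::max(0, (output - 1) * stride + kernel - input);
  return {output, total / 2, total - total / 2}; // any odd pixel goes after
}

Pad1D computeValidPadding(int input, int kernel, int stride)
{
  const int output = (input - kernel + stride) / stride; // ceil((input - kernel + 1) / stride)
  return {output, 0, 0};
}

int main()
{
  const Pad1D same = computeSamePadding(/*input=*/224, /*kernel=*/3, /*stride=*/2);
  std::printf("SAME : out=%d pad=(%d,%d)\n", same.output_size, same.before, same.after); // out=112 pad=(0,1)
  const Pad1D valid = computeValidPadding(224, 3, 2);
  std::printf("VALID: out=%d pad=(%d,%d)\n", valid.output_size, valid.before, valid.after); // out=111 pad=(0,0)
  return 0;
}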
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/DepthwiseConv2D.h" -#include "util/ShapeInference.h" -#include "util/Utils.h" - -#include -#include - -namespace onert -{ -namespace interp -{ - -namespace -{ - -void prepareDepthwiseConv(ExecEnv *env, const ir::Operation &node) -{ - const auto in_index = node.getInputs().at(ir::operation::DepthwiseConv2D::INPUT); - const auto kernel_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL); - const auto bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS); - const auto out_index = node.getOutputs().at(0); - - const auto in_tensor = env->tensorAt(in_index); - const auto kernel_tensor = env->tensorAt(kernel_index); - const auto bias_tensor = env->tensorAt(bias_index); - - assert(in_tensor->getShape().rank() == 4); - assert(kernel_tensor->getShape().rank() == 4); - assert(bias_tensor->getShape().rank() == 1); - - UNUSED_RELEASE(in_tensor); - UNUSED_RELEASE(kernel_tensor); - UNUSED_RELEASE(bias_tensor); - - // TODO handle unspecified output shape: - // calculate output shape using ifm shape, kernel shape, padding, stride - const auto output_info = env->graph().operands().at(out_index).info(); - if (output_info.total_size() == 0) - { - // Handle unspecified output shape - const auto &depth_conv_node = - nnfw::misc::polymorphic_downcast(node); - const auto infered_output_shape = shape_inference::inferDepthwiseConv2DShape( - in_tensor->tensorInfo().shape(), kernel_tensor->tensorInfo().shape(), - depth_conv_node.param()); - env->allocateIfNeeded( - out_index, ir::OperandInfo::createStaticInfo(infered_output_shape, output_info.typeInfo())); - } - else - { - env->allocateIfNeeded(out_index, output_info); - } - - auto out_tensor = env->tensorAt(out_index); - UNUSED_RELEASE(out_tensor); - - // Handle same ifm & ofm data type only - assert(in_tensor->data_type() == out_tensor->data_type()); - assert(out_tensor->getShape().rank() == 4); -} - -void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor *bias_tensor, - const ITensor *ofm_tensor, const ir::operation::DepthwiseConv2D::Param ¶m) -{ - // TODO Support NCHW frontend - const auto ifm_shape = ifm_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - const auto ofm_shape = ofm_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - // Kernel format is [1, kernel_height, kernel_width, depth_out]. 
- const auto &ker_shape = ker_tensor->tensorInfo().shape(); - const auto ker_height = ker_shape.dim(1); - const auto ker_width = ker_shape.dim(2); - const auto padding = - ir::calculatePadding(param.padding, ifm_shape, ofm_shape, param.stride, ker_width, ker_height); - - // Calculate - float activation_min, activation_max; - calculateActivationRange(param.activation, &activation_min, &activation_max); - - nnfw::cker::DepthwiseConvParams cker_param; - cker_param.padding_values.width = padding.left; - cker_param.padding_values.height = padding.top; - cker_param.depth_multiplier = param.multiplier; - cker_param.stride_width = param.stride.horizontal; - cker_param.stride_height = param.stride.vertical; - cker_param.dilation_width_factor = 1; - cker_param.dilation_height_factor = 1; - cker_param.float_activation_min = activation_min; - cker_param.float_activation_max = activation_max; - - const auto cker_ifm_shape = convertShape(ifm_tensor->tensorInfo().shape()); - const auto cker_ker_shape = convertShape(ker_tensor->tensorInfo().shape()); - const auto cker_bias_shape = convertShape(bias_tensor->tensorInfo().shape()); - const auto cker_ofm_shape = convertShape(ofm_tensor->tensorInfo().shape()); - const float *ifm_ptr = reinterpret_cast(ifm_tensor->bufferRO()); - const float *ker_ptr = reinterpret_cast(ker_tensor->bufferRO()); - const float *bias_ptr = reinterpret_cast(bias_tensor->bufferRO()); - float *ofm_ptr = reinterpret_cast(ofm_tensor->buffer()); - - nnfw::cker::DepthwiseConv(cker_param, cker_ifm_shape, ifm_ptr, cker_ker_shape, ker_ptr, - cker_bias_shape, bias_ptr, cker_ofm_shape, ofm_ptr, nullptr); -} - -void invokeDepthwiseConv(const ExecEnv *env, const ir::Operation &node) -{ - const auto &conv_node = static_cast(node); - - const auto ifm_index = node.getInputs().at(ir::operation::DepthwiseConv2D::INPUT); - const auto ker_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL); - const auto bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS); - const auto ofm_index = node.getOutputs().at(0); - - const auto ifm_tensor = env->tensorAt(ifm_index); - const auto ker_tensor = env->tensorAt(ker_index); - const auto bias_tensor = env->tensorAt(bias_index); - const auto ofm_tensor = env->tensorAt(ofm_index); - - const auto data_type = ifm_tensor->data_type(); - if (data_type == ir::DataType::FLOAT32) - { - invoke(ifm_tensor, ker_tensor, bias_tensor, ofm_tensor, conv_node.param()); - } - else - { - throw std::runtime_error{"NYI: Support float32 only"}; - } -} - -} // namespace - -OpKernel *getDepthwiseConv2D() -{ - static OpKernel kernel = {prepareDepthwiseConv, invokeDepthwiseConv}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/ElementwiseActivations.cc b/runtime/onert/core/src/interp/operations/ElementwiseActivations.cc deleted file mode 100644 index e13080e76..000000000 --- a/runtime/onert/core/src/interp/operations/ElementwiseActivations.cc +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/ElementwiseActivation.h" - -#include -#include -#include - -#include - -namespace onert -{ -namespace interp -{ -namespace -{ - -enum class ActivationType -{ - Logistic, - ReLU, - Tanh -}; - -void prepare(ExecEnv *env, const ir::Operation &node) -{ - const auto input_index = node.getInputs().at(0); - const auto output_index = node.getOutputs().at(0); - - const auto input_tensor = env->tensorAt(input_index); - - const auto output_info = env->graph().operands().at(output_index).info(); - if (output_info.total_size() == 0) - { - // Output's shape and type is same with input - auto input_info = input_tensor->tensorInfo(); - // We can handle already allocated (ex. model output) - env->allocateIfNeeded(output_index, input_info); - } - else - { - env->allocateIfNeeded(output_index, output_info); - } - - const auto output_tensor = env->tensorAt(output_index); - // Check shape and type lhs is same with output - // TODO Util function to compare TensorInfo - if (input_tensor->data_type() != output_tensor->data_type()) - { - throw std::runtime_error{"Interp(ElementwiseActivation): Invalid output type"}; - } -} - -template -void evalFloat(const float *input_ptr, float *output_ptr, uint64_t num_elements, float alpha, - float beta) -{ - std::function fn = [](const float &) { return std::nanf(""); }; - switch (act_type) - { - case ActivationType::ReLU: - fn = [alpha, beta](const float &in) { return std::min(std::max(beta, in), alpha); }; - break; - case ActivationType::Tanh: - fn = [](const float &in) { return std::tanh(in); }; - break; - default: - throw std::runtime_error{"Interp(ElementwiseActivation): NYI - Unsupported activation"}; - break; - } - - const float *input_end = input_ptr + num_elements; - for (; input_ptr < input_end; input_ptr++, output_ptr++) - { - *output_ptr = fn(*input_ptr); - } -} - -template void invoke(const ExecEnv *env, const ir::Operation &node) -{ - const auto input_index = node.getInputs().at(0); - const auto output_index = node.getOutputs().at(0); - - // Check lhs shape is same with rhs (with broadcast) - const auto input_tensor = env->tensorAt(input_index); - const auto output_tensor = env->tensorAt(output_index); - - const auto data_type = input_tensor->data_type(); - if (data_type == ir::DataType::FLOAT32) - { - uint64_t elements = input_tensor->num_elements(); - const float *input_start = reinterpret_cast(input_tensor->bufferRO()); - float *out = reinterpret_cast(output_tensor->buffer()); - if (act_type == ActivationType::Logistic) - { - const auto cker_input_shape = convertShape(input_tensor->tensorInfo().shape()); - const auto cker_output_shape = convertShape(output_tensor->tensorInfo().shape()); - nnfw::cker::Logistic(cker_input_shape, input_start, cker_output_shape, out); - } - else - { - const auto &act_node = - nnfw::misc::polymorphic_downcast(node); - evalFloat(input_start, out, elements, act_node.param().alpha, - act_node.param().beta); - } - } - else - { - throw std::runtime_error{"Interp(" + node.name() + "): NYI - Support float only"}; - } -} - -void 
invokeElementwiseActivation(const ExecEnv *env, const ir::Operation &node) -{ - const auto &act_node = - nnfw::misc::polymorphic_downcast(node); - switch (act_node.param().op_type) - { - case ir::operation::ElementwiseActivation::Type::LOGISTIC: - invoke(env, node); - break; - case ir::operation::ElementwiseActivation::Type::RELU: - invoke(env, node); - break; - case ir::operation::ElementwiseActivation::Type::TANH: - invoke(env, node); - break; - default: - throw std::runtime_error("Interp(" + node.name() + "): NYI - Unsupported activation"); - } -} - -} // namespace - -OpKernel *getElementwiseActivation() -{ - static OpKernel kernel = {prepare, invokeElementwiseActivation}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/FullyConnected.cc b/runtime/onert/core/src/interp/operations/FullyConnected.cc deleted file mode 100644 index 2bc9f517f..000000000 --- a/runtime/onert/core/src/interp/operations/FullyConnected.cc +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/FullyConnected.h" - -#include -#include - -namespace onert -{ -namespace interp -{ -namespace fc -{ - -void prepareFC(ExecEnv *env, const ir::Operation &node) -{ - const auto in_index = node.getInputs().at(ir::operation::FullyConnected::INPUT); - const auto kernel_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT); - const auto bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS); - const auto out_index = node.getOutputs().at(0); - - const auto in_tensor = env->tensorAt(in_index); - const auto kernel_tensor = env->tensorAt(kernel_index); - const auto bias_tensor = env->tensorAt(bias_index); - - UNUSED_RELEASE(in_tensor); - UNUSED_RELEASE(kernel_tensor); - UNUSED_RELEASE(bias_tensor); - - assert(in_tensor->getShape().rank() >= 2); - assert(kernel_tensor->getShape().rank() == 2); - assert(bias_tensor->getShape().rank() == 1); - - const auto input_size_with_batch = in_tensor->num_elements(); - const auto num_units = kernel_tensor->getShape().dim(0); - const auto input_size = kernel_tensor->getShape().dim(1); - const int32_t batch_size = input_size_with_batch / input_size; - assert(input_size_with_batch % input_size == 0); - assert(num_units == bias_tensor->getShape().dim(0)); - - // Make output tensor info - ir::Shape output_shape(2); - output_shape.dim(0) = batch_size; - output_shape.dim(1) = num_units; - const auto out_info = - ir::OperandInfo::createStaticInfo(output_shape, in_tensor->tensorInfo().typeInfo()); - env->allocateIfNeeded(out_index, out_info); - - auto out_tensor = env->tensorAt(out_index); - UNUSED_RELEASE(out_tensor); - - // Handle same ifm & ofm data type only - assert(in_tensor->data_type() == out_tensor->data_type()); - assert(out_tensor->getShape().rank() == 2); - assert(out_tensor->getShape().dim(0) == 
batch_size); - assert(out_tensor->getShape().dim(1) == num_units); -} - -void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor *bias_tensor, - const ITensor *ofm_tensor, const ir::operation::FullyConnected::Param ¶m) -{ - const auto ifm_buffer = ifm_tensor->bufferRO(); - const auto ker_buffer = ker_tensor->bufferRO(); - const auto bias_buffer = bias_tensor->bufferRO(); - auto ofm_buffer = ofm_tensor->buffer(); - - // Calculate - nnfw::cker::FullyConnectedParams cker_param; - cker_param.activation = convertActivationType(param.activation); - const auto cker_ifm_shape = convertShape(ifm_tensor->tensorInfo().shape()); - const auto cker_ker_shape = convertShape(ker_tensor->tensorInfo().shape()); - const auto cker_bias_shape = convertShape(bias_tensor->tensorInfo().shape()); - const auto cker_ofm_shape = convertShape(ofm_tensor->tensorInfo().shape()); - const float *ifm_ptr = reinterpret_cast(ifm_buffer); - const float *ker_ptr = reinterpret_cast(ker_buffer); - const float *bias_ptr = reinterpret_cast(bias_buffer); - float *ofm_ptr = reinterpret_cast(ofm_buffer); - - nnfw::cker::FullyConnected(cker_param, cker_ifm_shape, ifm_ptr, cker_ker_shape, ker_ptr, - cker_bias_shape, bias_ptr, cker_ofm_shape, ofm_ptr); -} - -void invokeFC(const ExecEnv *env, const ir::Operation &node) -{ - const auto &conv_node = - nnfw::misc::polymorphic_downcast(node); - - const auto ifm_index = node.getInputs().at(ir::operation::FullyConnected::INPUT); - const auto ker_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT); - const auto bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS); - const auto ofm_index = node.getOutputs().at(0); - - const auto ifm_tensor = env->tensorAt(ifm_index); - const auto ker_tensor = env->tensorAt(ker_index); - const auto bias_tensor = env->tensorAt(bias_index); - const auto ofm_tensor = env->tensorAt(ofm_index); - - const auto data_type = ifm_tensor->data_type(); - if (data_type == ir::DataType::FLOAT32) - { - invoke(ifm_tensor, ker_tensor, bias_tensor, ofm_tensor, conv_node.param()); - } - else - { - throw std::runtime_error{"NYI: Support float only"}; - } -} -} // namespace fc - -OpKernel *getFullyConnected() -{ - static OpKernel kernel = {fc::prepareFC, fc::invokeFC}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/Gather.cc b/runtime/onert/core/src/interp/operations/Gather.cc deleted file mode 100644 index d686cfcf6..000000000 --- a/runtime/onert/core/src/interp/operations/Gather.cc +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
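A tiny worked example of the output-shape rule used by prepareFC() above, assuming a weight tensor laid out as [num_units, input_size]: the input is flattened so that batch = num_elements(input) / input_size, giving an output of shape [batch, num_units]. Plain integers stand in for onert shapes.

#include <cstdio>

int main()
{
  const int input_elements = 2 * 8; // e.g. an input of shape [2, 8]
  const int num_units = 4;          // weight shape [num_units, input_size] = [4, 8]
  const int input_size = 8;
  const int batch = input_elements / input_size;              // 16 / 8 = 2
  std::printf("output shape: [%d, %d]\n", batch, num_units);  // [2, 4]
  return 0;
}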
- */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/Gather.h" - -#include -#include - -namespace onert -{ -namespace interp -{ -namespace -{ - -void prepareGather(ExecEnv *env, const ir::Operation &node) -{ - const auto input_index = node.getInputs().at(ir::operation::Gather::INPUT); - const auto indices_index = node.getInputs().at(ir::operation::Gather::INDICES); - const auto output_index = node.getOutputs().at(0); - - const auto input_tensor = env->tensorAt(input_index); - const auto indices_tensor = env->tensorAt(indices_index); - - // TODO handle unspecified output shape: - // calculate output shape using ifm shape, kernel shape, padding, stride - const auto output_info = env->graph().operands().at(output_index).info(); - if (output_info.total_size() == 0) - { - throw std::runtime_error{"Interp(Gather): NYI for unspecified output shape"}; - } - else - { - env->allocateIfNeeded(output_index, output_info); - } - - if (indices_tensor->data_type() != ir::DataType::INT32) - { - throw std::runtime_error{"Interp(Gather): Invalid indices data type"}; - } - - auto output_tensor = env->tensorAt(output_index); - auto output_rank = input_tensor->getShape().rank() + indices_tensor->getShape().rank() - 1; - - if (output_rank != output_tensor->getShape().rank()) - { - throw std::runtime_error{"Interp(Gather): Invalid output rank"}; - } - if (output_tensor->data_type() != input_tensor->data_type()) - { - throw std::runtime_error{"Interp(Gather): Invalid output data type"}; - } - - if (input_tensor->data_type() == ir::DataType::QUANT_UINT8_ASYMM && - input_tensor->tensorInfo().typeInfo() != output_tensor->tensorInfo().typeInfo()) - { - throw std::runtime_error{ - "Interp(Gather): Cannot handle different I/O QUANT_UINT8_ASYMM scale/offset"}; - } -} - -template -void invoke(const ITensor *input_tensors, const ITensor *indices_tensors, - const ITensor *output_tensor, uint32_t axis) -{ - // Calculate - nnfw::cker::GatherParams cker_param; - cker_param.axis = (int8_t)axis; - - const auto cker_input_shapes = convertShape(input_tensors->tensorInfo().shape()); - const auto cker_indices_shape = convertShape(indices_tensors->tensorInfo().shape()); - const auto cker_output_shape = convertShape(output_tensor->tensorInfo().shape()); - const raw_type *input_ptr = reinterpret_cast(input_tensors->bufferRO()); - const int32_t *indices_ptr = reinterpret_cast(indices_tensors->bufferRO()); - raw_type *output_ptr = reinterpret_cast(output_tensor->buffer()); - - nnfw::cker::Gather(cker_param, cker_input_shapes, input_ptr, cker_indices_shape, - indices_ptr, cker_output_shape, output_ptr); -} - -void invokeGather(const ExecEnv *env, const ir::Operation &node) -{ - const auto &gather_node = nnfw::misc::polymorphic_downcast(node); - const int32_t axis_raw = gather_node.param().axis; - - const auto input_index = node.getInputs().at(ir::operation::Gather::INPUT); - const auto indices_index = node.getInputs().at(ir::operation::Gather::INDICES); - const auto output_index = node.getOutputs().at(0); - - const auto input_tensor = env->tensorAt(input_index); - const auto indices_tensor = env->tensorAt(indices_index); - const auto output_tensor = env->tensorAt(output_index); - const uint32_t axis = (axis_raw < 0) ? 
(axis_raw + input_tensor->getShape().rank()) : axis_raw; - - const auto data_type = input_tensor->data_type(); - - switch (data_type) - { - case ir::DataType::FLOAT32: - invoke(input_tensor, indices_tensor, output_tensor, axis); - break; - case ir::DataType::INT32: - invoke(input_tensor, indices_tensor, output_tensor, axis); - break; - case ir::DataType::QUANT_UINT8_ASYMM: - invoke(input_tensor, indices_tensor, output_tensor, axis); - break; - default: - throw std::runtime_error{"Interp(Gather): NYI - Not supported type"}; - } -} - -} // namespace - -OpKernel *getGather() -{ - static OpKernel kernel = {prepareGather, invokeGather}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/InstanceNorm.cc b/runtime/onert/core/src/interp/operations/InstanceNorm.cc deleted file mode 100644 index 318088457..000000000 --- a/runtime/onert/core/src/interp/operations/InstanceNorm.cc +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/InstanceNorm.h" - -#include -#include - -namespace onert -{ -namespace interp -{ -namespace instancenorm -{ - -void prepareInstanceNorm(ExecEnv *env, const ir::Operation &node) -{ - const auto &instancenorm_node = - nnfw::misc::polymorphic_downcast(node); - - const auto input_index = node.getInputs().at(instancenorm_node.INPUT); - const auto output_index = node.getOutputs().at(0); - const auto input_tensor = env->tensorAt(input_index); - - if (input_tensor->getShape().rank() != 4) - { - throw std::runtime_error{"Interp(InstanceNorm): Input should be 4D-tensor"}; - } - - // Output shape should be same with input - env->allocateIfNeeded(output_index, input_tensor->tensorInfo()); - - auto output_tensor = env->tensorAt(output_index); - UNUSED_RELEASE(output_tensor); - - // Handle same ifm & ofm data type only - assert(input_tensor->data_type() == output_tensor->data_type()); - assert(input_tensor->tensorInfo().shape() == output_tensor->tensorInfo().shape()); -} - -inline void setActivationParams(float min, float max, nnfw::cker::InstanceNormParams *params) -{ - params->float_activation_min = min; - params->float_activation_max = max; -} - -void invoke(const ITensor *input_tensor, const ITensor *gamma_tensor, const ITensor *beta_tensor, - const ITensor *output_tensor, const ir::operation::InstanceNorm::Param ¶m) -{ - // Calculate - float activation_min, activation_max; - calculateActivationRange(param.activation, &activation_min, &activation_max); - - nnfw::cker::InstanceNormParams cker_param; - cker_param.epsilon = param.epsilon; - cker_param.float_activation_min = activation_min; - cker_param.float_activation_max = activation_max; - - const auto cker_input_shape = convertShape(input_tensor->tensorInfo().shape()); - const auto cker_gamma_shape = convertShape(gamma_tensor->tensorInfo().shape()); - const auto 
cker_beta_shape = convertShape(beta_tensor->tensorInfo().shape()); - const auto cker_output_shape = convertShape(output_tensor->tensorInfo().shape()); - const float *input_ptr = reinterpret_cast(input_tensor->bufferRO()); - const float *gamma_ptr = reinterpret_cast(gamma_tensor->bufferRO()); - const float *beta_ptr = reinterpret_cast(beta_tensor->bufferRO()); - float *output_ptr = reinterpret_cast(output_tensor->buffer()); - - nnfw::cker::InstanceNorm(cker_param, cker_input_shape, input_ptr, cker_gamma_shape, gamma_ptr, - cker_beta_shape, beta_ptr, cker_output_shape, output_ptr); -} - -void invokeInstanceNorm(const ExecEnv *env, const ir::Operation &node) -{ - const auto &instancenorm_node = - nnfw::misc::polymorphic_downcast(node); - - const auto input_index = node.getInputs().at(instancenorm_node.INPUT); - const auto gamma_index = node.getInputs().at(instancenorm_node.GAMMA); - const auto beta_index = node.getInputs().at(instancenorm_node.BETA); - const auto out_index = node.getOutputs().at(0); - const auto input_tensor = env->tensorAt(input_index); - const auto gamma_tensor = env->tensorAt(gamma_index); - const auto beta_tensor = env->tensorAt(beta_index); - const auto out_tensor = env->tensorAt(out_index); - const auto data_type = input_tensor->data_type(); - - if (data_type == ir::DataType::FLOAT32) - { - invoke(input_tensor, gamma_tensor, beta_tensor, out_tensor, instancenorm_node.param()); - } - else - { - throw std::runtime_error{"NYI: Unsupported data type"}; - } -} -} // namespace instancenorm - -OpKernel *getInstanceNorm() -{ - static OpKernel kernel = {instancenorm::prepareInstanceNorm, instancenorm::invokeInstanceNorm}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/OperationUtil.h b/runtime/onert/core/src/interp/operations/OperationUtil.h deleted file mode 100644 index 2fdf098f0..000000000 --- a/runtime/onert/core/src/interp/operations/OperationUtil.h +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
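A compact restatement of the arithmetic that the InstanceNorm kernel above delegates to nnfw::cker::InstanceNorm, shown for a single channel only: y = gamma * (x - mean) / sqrt(var + eps) + beta. The real kernel iterates over batches and channels and applies the activation clamp afterwards; this is an illustrative sketch, not the cker implementation.

#include <cmath>
#include <cstdio>
#include <vector>

void instanceNormOneChannel(const std::vector<float> &x, float gamma, float beta, float eps,
                            std::vector<float> &y)
{
  float mean = 0.0f;
  for (float v : x)
    mean += v;
  mean /= static_cast<float>(x.size());

  float var = 0.0f;
  for (float v : x)
    var += (v - mean) * (v - mean);
  var /= static_cast<float>(x.size());

  y.resize(x.size());
  for (size_t i = 0; i < x.size(); ++i)
    y[i] = gamma * (x[i] - mean) / std::sqrt(var + eps) + beta;
}

int main()
{
  std::vector<float> y;
  instanceNormOneChannel({1.0f, 2.0f, 3.0f, 4.0f}, /*gamma=*/1.0f, /*beta=*/0.0f, /*eps=*/1e-5f, y);
  for (float v : y)
    std::printf("%f ", v); // roughly -1.34 -0.45 0.45 1.34
  std::printf("\n");
  return 0;
}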
- */ - -#ifndef __ONERT_INTERP_OPERATIONS_OPERATION_UTILS_H_ -#define __ONERT_INTERP_OPERATIONS_OPERATION_UTILS_H_ - -#include "ir/Shape.h" -#include "ir/InternalType.h" -#include "ir/Padding.h" - -#include -#include - -namespace onert -{ -namespace interp -{ - -inline nnfw::cker::Shape convertShape(const ir::Shape &shape) -{ - auto dimensions = std::vector(shape.dims().begin(), shape.dims().end()); - - std::vector raw_shape; - raw_shape.resize(dimensions.size()); - - for (uint32_t i = 0; i < dimensions.size(); ++i) - { - raw_shape[i] = dimensions[i]; - } - - return nnfw::cker::GetShape(raw_shape); -} - -inline nnfw::cker::Shape convertExtendShape(const ir::Shape &shape) -{ - auto dimensions = std::vector(shape.dims().begin(), shape.dims().end()); - - const int32_t extended_rank = 4; - int32_t raw_shape[extended_rank]; - uint32_t start = extended_rank - dimensions.size(); - - for (uint32_t i = 0; i < extended_rank; ++i) - { - if (i < start) - { - raw_shape[i] = 1; - } - else - { - raw_shape[i] = dimensions[i - start]; - } - } - - return nnfw::cker::Shape(extended_rank, raw_shape); -} - -inline nnfw::cker::FusedActivationFunctionType -convertActivationType(const ir::Activation activation) -{ - switch (activation) - { - case ir::Activation::NONE: - return nnfw::cker::FusedActivationFunctionType::kNone; - case ir::Activation::RELU: - return nnfw::cker::FusedActivationFunctionType::kRelu; - case ir::Activation::RELU1: - return nnfw::cker::FusedActivationFunctionType::kRelu1; - case ir::Activation::RELU6: - return nnfw::cker::FusedActivationFunctionType::kRelu6; - default: - throw std::runtime_error{"CPU backend: Cannot convert activation type"}; - } -} - -template -void calculateActivationRange(ir::Activation activation, T *activation_min, T *activation_max) -{ - if (activation == ir::Activation::RELU) - { - *activation_min = 0; - *activation_max = std::numeric_limits::max(); - } - else if (activation == ir::Activation::RELU6) - { - *activation_min = 0; - *activation_max = 6; - } - else if (activation == ir::Activation::RELU1) - { - *activation_min = -1; - *activation_max = 1; - } - else if (activation == ir::Activation::NONE) - { - *activation_min = std::numeric_limits::lowest(); - *activation_max = std::numeric_limits::max(); - } - else - { - throw std::runtime_error{"Unsupported activation type"}; - } -} - -inline ir::Shape calcBroadcastShape(const ir::Shape &lhs, const ir::Shape &rhs, bool &success) -{ - int lhs_rank = lhs.rank(); - int rhs_rank = rhs.rank(); - - int out_rank = (lhs_rank > rhs_rank ? 
lhs_rank : rhs_rank); - ir::Shape out_shape(out_rank); - - int lhs_idim = lhs_rank - 1; - int rhs_idim = rhs_rank - 1; - success = true; - for (int out_idim = out_rank - 1; out_idim >= 0; out_idim--) - { - if (lhs_idim == -1 && rhs_idim == -1) - { - // invalid result - success = false; - break; - } - - if (lhs_idim == -1) - { - out_shape.dim(out_idim) = rhs.dim(rhs_idim); - rhs_idim--; - } - else if (rhs_idim == -1) - { - out_shape.dim(out_idim) = lhs.dim(lhs_idim); - lhs_idim--; - } - else - { - if (lhs.dim(lhs_idim) == rhs.dim(rhs_idim)) - { - out_shape.dim(out_idim) = lhs.dim(lhs_idim); - lhs_idim--; - rhs_idim--; - } - else if (lhs.dim(lhs_idim) == 1) - { - out_shape.dim(out_idim) = rhs.dim(rhs_idim); - lhs_idim--; - rhs_idim--; - } - else if (rhs.dim(rhs_idim) == 1) - { - out_shape.dim(out_idim) = lhs.dim(lhs_idim); - lhs_idim--; - rhs_idim--; - } - else - { - // invalid result - success = false; - break; - } - } - } - - if (lhs_idim != -1 || rhs_idim != -1) - { - // invalid result - success = false; - } - return out_shape; -} - -inline nnfw::cker::PaddingType convertPaddingType(ir::PaddingType ir_padding_type) -{ - switch (ir_padding_type) - { - case ir::PaddingType::EXPLICIT: - return nnfw::cker::PaddingType::kNone; - case ir::PaddingType::SAME: - return nnfw::cker::PaddingType::kSame; - case ir::PaddingType::VALID: - return nnfw::cker::PaddingType::kValid; - default: - throw std::runtime_error("Wrong padding type."); - break; - } -} - -} // namespace interp -} // namespace onert - -#endif // __ONERT_INTERP_OPERATIONS_OPERATION_UTILS_H_ diff --git a/runtime/onert/core/src/interp/operations/Pad.cc b/runtime/onert/core/src/interp/operations/Pad.cc deleted file mode 100644 index 3db0828eb..000000000 --- a/runtime/onert/core/src/interp/operations/Pad.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
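An illustrative restatement of the broadcast rule implemented by calcBroadcastShape() in OperationUtil.h above: shapes are aligned from the trailing dimension, and each dimension pair must either match or contain a 1. Plain std::vector<int> shapes stand in for ir::Shape; this is a sketch, not the onert implementation.

#include <algorithm>
#include <cstdio>
#include <vector>

bool broadcastShape(const std::vector<int> &lhs, const std::vector<int> &rhs,
                    std::vector<int> &out)
{
  const int rank = static_cast<int>(std::max(lhs.size(), rhs.size()));
  out.assign(rank, 1);
  for (int i = 0; i < rank; ++i) // i counts from the trailing dimension
  {
    const int l = (i < static_cast<int>(lhs.size())) ? lhs[lhs.size() - 1 - i] : 1;
    const int r = (i < static_cast<int>(rhs.size())) ? rhs[rhs.size() - 1 - i] : 1;
    if (l != r && l != 1 && r != 1)
      return false; // incompatible, mirrors the "success = false" path above
    out[rank - 1 - i] = std::max(l, r);
  }
  return true;
}

int main()
{
  std::vector<int> out;
  if (broadcastShape({2, 3, 1}, {3, 4}, out)) // -> {2, 3, 4}
    std::printf("broadcast shape: %d x %d x %d\n", out[0], out[1], out[2]);
  return 0;
}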
- */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/Pad.h" - -#include - -namespace onert -{ -namespace interp -{ -namespace -{ - -void preparePad(ExecEnv *env, const ir::Operation &node) -{ - const auto input_index = node.getInputs().at(ir::operation::Pad::INPUT); - const auto output_index = node.getOutputs().at(0); - - const auto input_tensor = env->tensorAt(input_index); - - const auto output_info = env->graph().operands().at(output_index).info(); - - // Check shape and type lhs is same with rhs - // TODO Util function to compare TensorInfo - if (output_info.total_size() == 0) - { - throw std::runtime_error{"Interp(Pad): NYI unspecified output shape"}; - } - else - { - env->allocateIfNeeded(output_index, output_info); - } - - const auto output_tensor = env->tensorAt(output_index); - if (input_tensor->data_type() != output_tensor->data_type()) - { - throw std::runtime_error{"Interp(Pad): Invalid output type"}; - } -} - -void invoke(const ITensor *input_tensor, const ITensor *pad_tensor, const ITensor *output_tensor) -{ - const auto input_buffer = input_tensor->bufferRO(); - const auto pad_buffer = pad_tensor->bufferRO(); - auto output_buffer = output_tensor->buffer(); - - int32_t pad_rank = pad_tensor->getShape().dim(0); - - const auto cker_input_shape = convertShape(input_tensor->tensorInfo().shape()); - const auto cker_output_shape = convertShape(output_tensor->tensorInfo().shape()); - const float *input_ptr = reinterpret_cast(input_buffer); - const int32_t *pad_ptr = reinterpret_cast(pad_buffer); - float *output_ptr = reinterpret_cast(output_buffer); - - nnfw::cker::Pad(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape, - output_ptr, nullptr); -} - -void invokePad(const ExecEnv *env, const ir::Operation &node) -{ - const auto input_index = node.getInputs().at(ir::operation::Pad::INPUT); - const auto pad_index = node.getInputs().at(ir::operation::Pad::PAD); - const auto output_index = node.getOutputs().at(0); - - const auto input_tensor = env->tensorAt(input_index); - const auto pad_tensor = env->tensorAt(pad_index); - const auto output_tensor = env->tensorAt(output_index); - - const auto data_type = input_tensor->data_type(); - - if (data_type == ir::DataType::FLOAT32) - { - invoke(input_tensor, pad_tensor, output_tensor); - } - else - { - throw std::runtime_error{"Interp(Pad): NYI - Unsupported data type"}; - } -} -} // namespace - -OpKernel *getPad() -{ - static OpKernel kernel = {preparePad, invokePad}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/Pool2D.cc b/runtime/onert/core/src/interp/operations/Pool2D.cc deleted file mode 100644 index 3935d4756..000000000 --- a/runtime/onert/core/src/interp/operations/Pool2D.cc +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/Pool2D.h" -#include "util/ShapeInference.h" -#include "util/Utils.h" - -#include -#include -#include - -namespace onert -{ -namespace interp -{ -namespace pool2d -{ - -void preparePool2D(ExecEnv *env, const ir::Operation &node) -{ - const auto &pool_node = nnfw::misc::polymorphic_downcast(node); - const auto in_index = node.getInputs().at(pool_node.INPUT); - const auto out_index = node.getOutputs().at(0); - - const auto in_tensor = env->tensorAt(in_index); - UNUSED_RELEASE(in_tensor); - - assert(in_tensor->getShape().rank() == 4); - - const auto output_info = env->graph().operands().at(out_index).info(); - if (output_info.total_size() == 0) - { - // Handle unspecified output shape - const auto infered_output_shape = - shape_inference::inferPoolShape(in_tensor->tensorInfo().shape(), pool_node.param()); - env->allocateIfNeeded( - out_index, ir::OperandInfo::createStaticInfo(infered_output_shape, output_info.typeInfo())); - } - else - { - env->allocateIfNeeded(out_index, output_info); - } - - auto out_tensor = env->tensorAt(out_index); - UNUSED_RELEASE(out_tensor); - - // Handle same ifm & ofm data type only - assert(in_tensor->data_type() == out_tensor->data_type()); - assert(out_tensor->getShape().rank() == 4); -} - -template -void invoke(const nnfw::cker::PoolParams ¶ms, const nnfw::cker::Shape &in_shape, - const T *in_ptr, const nnfw::cker::Shape &out_shape, T *out_ptr, - ir::operation::Pool2D::PoolType op_type) -{ - switch (op_type) - { - case ir::operation::Pool2D::PoolType::AVG: - nnfw::cker::AveragePool(params, in_shape, in_ptr, out_shape, out_ptr); - break; - case ir::operation::Pool2D::PoolType::MAX: - nnfw::cker::MaxPool(params, in_shape, in_ptr, out_shape, out_ptr); - break; - default: - throw std::runtime_error{"Interp(Pool2D): NYI unsupported operation"}; - break; - } -} - -void invokePool2DOps(const ExecEnv *env, const ir::Operation &node) -{ - const auto &pool_node = nnfw::misc::polymorphic_downcast(node); - - const auto in_index = node.getInputs().at(0); - const auto out_index = node.getOutputs().at(0); - - // Check lhs shape is same with rhs (with broadcast) - const auto in_tensor = env->tensorAt(in_index); - const auto out_tensor = env->tensorAt(out_index); - - // TODO support NCHW frontend - const auto ifm_shape = in_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - const auto ofm_shape = out_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - const auto param = pool_node.param(); - const auto padding = - ir::calculatePadding(param.padding, ifm_shape, ofm_shape, param.stride, param.kw, param.kh); - // Calculate - nnfw::cker::PoolParams cker_param; - cker_param.filter_width = param.kw; - cker_param.filter_height = param.kh; - cker_param.padding_values.width = padding.left; - cker_param.padding_values.height = padding.top; - cker_param.stride_width = param.stride.horizontal; - cker_param.stride_height = param.stride.vertical; - - const auto data_type = in_tensor->data_type(); - if (data_type == ir::DataType::FLOAT32) - { - calculateActivationRange(param.activation, &cker_param.float_activation_min, - &cker_param.float_activation_max); - - const auto in_shape = convertShape(in_tensor->tensorInfo().shape()); - const auto out_shape = convertShape(out_tensor->tensorInfo().shape()); - const float *in_ptr = reinterpret_cast(in_tensor->bufferRO()); - float *out_ptr = reinterpret_cast(out_tensor->buffer()); - // Now, invoke() supports only Pool2D in float - invoke(cker_param, 
in_shape, in_ptr, out_shape, out_ptr, param.op_type); - } - else - { - throw std::runtime_error{"NYI: Support float only"}; - } -} -} // namespace pool2d - -OpKernel *getPool2D() -{ - static OpKernel kernel = {pool2d::preparePool2D, pool2d::invokePool2DOps}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/Reshape.cc b/runtime/onert/core/src/interp/operations/Reshape.cc deleted file mode 100644 index 1de5a5762..000000000 --- a/runtime/onert/core/src/interp/operations/Reshape.cc +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "../Registration.h" - -namespace onert -{ -namespace interp -{ -namespace -{ - -void prepare(ExecEnv *env, const ir::Operation &node) -{ - const auto in_index = node.getInputs().at(0); - const auto out_index = node.getOutputs().at(0); - - // Unspecified shape is not supported in operation node spec now - const auto output_info = env->graph().operands().at(out_index).info(); - env->allocateAndShareIfNeeded(out_index, output_info, in_index); - - assert(output_info.total_size() == env->graph().operands().at(in_index).info().total_size()); -} - -void invoke(const ExecEnv *env, const ir::Operation &node) -{ - const auto in_index = node.getInputs().at(0); - const auto out_index = node.getOutputs().at(0); - - if (env->tensorAt(in_index)->bufferRO() == env->tensorAt(out_index)->bufferRO()) - { - // Same data - return; - } - - const auto output_info = env->graph().operands().at(out_index).info(); - memcpy(env->tensorAt(out_index)->buffer(), env->tensorAt(in_index)->bufferRO(), - output_info.total_size()); -} - -} // namespace - -OpKernel *getReshape() -{ - static OpKernel kernel = {prepare, invoke}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/Softmax.cc b/runtime/onert/core/src/interp/operations/Softmax.cc deleted file mode 100644 index 8be2f2210..000000000 --- a/runtime/onert/core/src/interp/operations/Softmax.cc +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/Softmax.h" - -#include -#include - -namespace onert -{ -namespace interp -{ -namespace -{ - -void prepareSoftMax(ExecEnv *env, const ir::Operation &node) -{ - const auto in_index = node.getInputs().at(0); - const auto out_index = node.getOutputs().at(0); - - const auto in_tensor = env->tensorAt(in_index); - UNUSED_RELEASE(in_tensor); - - assert((in_tensor->getShape().rank() == 4) || (in_tensor->getShape().rank() == 2)); - - // Output shape should be the same as input - // Output type is pre-defined in model - const auto output_shape = env->graph().operands().at(in_index).info().shape(); - const auto output_type = env->graph().operands().at(out_index).info().typeInfo(); - - const auto output_info = ir::OperandInfo::createStaticInfo(output_shape, output_type); - env->allocateIfNeeded(out_index, output_info); - - auto out_tensor = env->tensorAt(out_index); - UNUSED_RELEASE(out_tensor); - - // Check output shape is the same as input - assert(in_tensor->getShape().rank() == out_tensor->getShape().rank()); - for (int32_t i = 0; i < in_tensor->getShape().rank(); i++) - { - assert(in_tensor->getShape().dim(i) == out_tensor->getShape().dim(i)); - } -} - -void invoke(const ITensor *in_tensor, const ITensor *out_tensor, - const ir::operation::Softmax::Param &param) -{ - const float *in_ptr = reinterpret_cast<const float *>(in_tensor->bufferRO()); - float *out_ptr = reinterpret_cast<float *>(out_tensor->buffer()); - - float beta = param.beta; - - if (in_tensor->getShape().rank() == 2) - { - uint32_t batch_size = in_tensor->getShape().dim(0); - uint32_t input_size = in_tensor->getShape().dim(1); - - nnfw::cker::Softmax(in_ptr, input_size, batch_size, beta, out_ptr); - } - else if (in_tensor->getShape().rank() == 4) - { - const auto in_shape = convertShape(in_tensor->tensorInfo().shape()); - const auto out_shape = convertShape(out_tensor->tensorInfo().shape()); - - nnfw::cker::SoftmaxParams cker_param; - cker_param.beta = beta; - - nnfw::cker::Softmax(cker_param, in_shape, in_ptr, out_shape, out_ptr); - } - else - { - throw std::runtime_error{"Unsupported input dimension: support 2D or 4D"}; - } -} - -void invokeSoftMax(const ExecEnv *env, const ir::Operation &node) -{ - const auto &softmax_node = nnfw::misc::polymorphic_downcast<const ir::operation::Softmax &>(node); - - const auto in_index = node.getInputs().at(0); - const auto out_index = node.getOutputs().at(0); - - const auto in_tensor = env->tensorAt(in_index); - const auto out_tensor = env->tensorAt(out_index); - - const auto in_data_type = in_tensor->data_type(); - const auto out_data_type = out_tensor->data_type(); - if ((in_data_type == ir::DataType::FLOAT32) && (out_data_type == ir::DataType::FLOAT32)) - { - invoke(in_tensor, out_tensor, softmax_node.param()); - } - else - { - throw std::runtime_error{"NYI: Support float32 only"}; - } -} - -} // namespace - -OpKernel *getSoftmax() -{ - static OpKernel kernel = {prepareSoftMax, invokeSoftMax}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/TransposeConv.cc b/runtime/onert/core/src/interp/operations/TransposeConv.cc deleted file mode 100644 index 59c8e8cdf..000000000 --- a/runtime/onert/core/src/interp/operations/TransposeConv.cc +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/TransposeConv.h" - -#include -#include - -namespace onert -{ -namespace interp -{ -namespace -{ - -void prepareTransposeConv(ExecEnv *env, const ir::Operation &node) -{ - const auto ifm_index = node.getInputs().at(ir::operation::TransposeConv::INPUT); - const auto ker_index = node.getInputs().at(ir::operation::TransposeConv::KERNEL); - const auto ofm_shape_index = node.getInputs().at(ir::operation::TransposeConv::OUTPUT_SHAPE); - const auto ofm_index = node.getOutputs().at(0); - - const auto ifm_tensor = env->tensorAt(ifm_index); - const auto ker_tensor = env->tensorAt(ker_index); - const auto ofm_shape_tensor = env->tensorAt(ofm_shape_index); - - assert(ifm_tensor->getShape().rank() == 4); - assert(ker_tensor->getShape().rank() == 4); - assert(ofm_shape_tensor->getShape().rank() == 1); - - UNUSED_RELEASE(ifm_tensor); - UNUSED_RELEASE(ker_tensor); - UNUSED_RELEASE(ofm_shape_tensor); - - const auto output_info = env->graph().operands().at(ofm_index).info(); - if (output_info.total_size() == 0) - { - // TODO: Handle unspecified output shape - throw std::runtime_error{"Interp(TConv): NYI unspecified output shape"}; - } - else - { - env->allocateIfNeeded(ofm_index, output_info); - } - - auto ofm_tensor = env->tensorAt(ofm_index); - UNUSED_RELEASE(ofm_tensor); - - // Handle same ifm & ofm data type only - if (ifm_tensor->data_type() != ofm_tensor->data_type()) - { - throw std::runtime_error{"Interp(TConv): Different I/O data type"}; - } - - if (ofm_tensor->getShape().rank() != 4) - { - throw std::runtime_error{"Interp(TConv): Invalid output rank"}; - } -} - -void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor *ofm_tensor, - const ir::operation::TransposeConv::Param &param) -{ - const auto ifm_shape = ifm_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - const auto ofm_shape = ofm_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
- const auto ker_shape = ker_tensor->tensorInfo().shape(); - const auto ker_height = ker_shape.dim(1); - const auto ker_width = ker_shape.dim(2); - const auto padding = - ir::calculatePadding(param.padding, ofm_shape, ifm_shape, param.stride, ker_width, ker_height); - - nnfw::cker::TransposeConvParams cker_param; - cker_param.padding_values.width = padding.left; - cker_param.padding_values.height = padding.top; - cker_param.stride_width = param.stride.horizontal; - cker_param.stride_height = param.stride.vertical; - cker_param.dilation_width_factor = 1; - cker_param.dilation_height_factor = 1; - - const auto cker_ifm_shape = convertShape(ifm_tensor->tensorInfo().shape()); - const auto cker_ker_shape = convertShape(ker_tensor->tensorInfo().shape()); - const auto cker_ofm_shape = convertShape(ofm_tensor->tensorInfo().shape()); - const float *ifm_ptr = reinterpret_cast(ifm_tensor->bufferRO()); - const float *ker_ptr = reinterpret_cast(ker_tensor->bufferRO()); - float *ofm_ptr = reinterpret_cast(ofm_tensor->buffer()); - - nnfw::cker::TransposeConv(cker_param, cker_ifm_shape, ifm_ptr, cker_ker_shape, ker_ptr, - cker_ofm_shape, ofm_ptr); -} - -void invokeTransposeConv(const ExecEnv *env, const ir::Operation &node) -{ - const auto &tconv_node = - nnfw::misc::polymorphic_downcast(node); - - const auto ifm_index = node.getInputs().at(ir::operation::TransposeConv::INPUT); - const auto ker_index = node.getInputs().at(ir::operation::TransposeConv::KERNEL); - const auto ofm_index = node.getOutputs().at(0); - - const auto ifm_tensor = env->tensorAt(ifm_index); - const auto ker_tensor = env->tensorAt(ker_index); - const auto ofm_tensor = env->tensorAt(ofm_index); - - const auto data_type = ifm_tensor->data_type(); - if (data_type == ir::DataType::FLOAT32) - { - invoke(ifm_tensor, ker_tensor, ofm_tensor, tconv_node.param()); - } - else - { - throw std::runtime_error{"Interp(TConv): Support float32 only"}; - } -} - -} // namespace - -OpKernel *getTransposeConv() -{ - static OpKernel kernel = {prepareTransposeConv, invokeTransposeConv}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/ir/Shape.cc b/runtime/onert/core/src/ir/Shape.cc index a7c50a266..e4e4c154b 100644 --- a/runtime/onert/core/src/ir/Shape.cc +++ b/runtime/onert/core/src/ir/Shape.cc @@ -26,10 +26,10 @@ namespace onert namespace ir { -int32_t const Shape::UNSPECIFIED_DIM = -1; +int32_t const Shape::kUnspecifiedDim = -1; // NNFW_MAX_RANK is 6 -int32_t const Shape::MAX_RANK = 6; +int32_t const Shape::kMaxRank = 6; FeatureShape Shape::asFeature(Layout layout) const { @@ -80,7 +80,7 @@ uint64_t Shape::num_elements() const { // if dimension is 0, it means unspecified and cannot calculate the total number of elements if (std::any_of(_dimensions.begin(), _dimensions.end(), - [](const int32_t &v) { return v == UNSPECIFIED_DIM; })) + [](const int32_t &v) { return v == kUnspecifiedDim; })) throw std::runtime_error("num_elements() cannot calculate when any dimension is unspecified"); return std::accumulate(_dimensions.cbegin(), _dimensions.cend(), UINT64_C(1), @@ -89,7 +89,7 @@ uint64_t Shape::num_elements() const Shape permuteShape(const Shape &shape, Layout from, Layout to) { - assert(shape.rank() <= Shape::MAX_RANK); + assert(shape.rank() <= Shape::kMaxRank); Shape ret{shape}; if (from == to) return ret; diff --git a/runtime/onert/core/src/ir/Shape.test.cc b/runtime/onert/core/src/ir/Shape.test.cc index afdb29254..4788522d3 100644 --- a/runtime/onert/core/src/ir/Shape.test.cc +++ 
b/runtime/onert/core/src/ir/Shape.test.cc @@ -48,7 +48,7 @@ TEST(ShapeTest, neg_basic_test) onert::ir::Shape shape(2); shape.dim(0) = 1; - shape.dim(1) = onert::ir::Shape::UNSPECIFIED_DIM; + shape.dim(1) = onert::ir::Shape::kUnspecifiedDim; ASSERT_EQ(shape.rank(), 2); ASSERT_EQ(onert::ir::rankMaybeUnspecified(shape), false); diff --git a/runtime/onert/core/src/util/ChromeTracingEventWriter.cc b/runtime/onert/core/src/util/ChromeTracingEventWriter.cc index d868efedf..c3f5179df 100644 --- a/runtime/onert/core/src/util/ChromeTracingEventWriter.cc +++ b/runtime/onert/core/src/util/ChromeTracingEventWriter.cc @@ -168,7 +168,7 @@ void ChromeTracingWriter::flush(const std::vector _os << "{\n"; _os << " " << quote("traceEvents") << ": [\n"; - for (auto &recorder : recorders) + for (const auto &recorder : recorders) { flushOneRecord(*recorder); } @@ -180,7 +180,7 @@ void ChromeTracingWriter::flush(const std::vector void ChromeTracingWriter::flushOneRecord(const EventRecorder &recorder) { - for (auto &evt : recorder.duration_events()) + for (const auto &evt : recorder.duration_events()) { const std::string name = getLabel(*evt); const std::string tid = getTid(*evt); @@ -188,7 +188,7 @@ void ChromeTracingWriter::flushOneRecord(const EventRecorder &recorder) _os << " " << object(*evt, name, tid) << ",\n"; } - for (auto &evt : recorder.counter_events()) + for (const auto &evt : recorder.counter_events()) { _os << " " << object(evt) << ",\n"; } diff --git a/runtime/onert/core/src/util/MDTableEventWriter.cc b/runtime/onert/core/src/util/MDTableEventWriter.cc index 7a8b9f234..13dab5b77 100644 --- a/runtime/onert/core/src/util/MDTableEventWriter.cc +++ b/runtime/onert/core/src/util/MDTableEventWriter.cc @@ -32,7 +32,7 @@ namespace void writeMDTableRow(std::ostream &os, const std::vector &list) { os << "| "; - for (auto &key : list) + for (const auto &key : list) { os << key << " | "; } @@ -227,7 +227,7 @@ struct MDTableBuilder MDTableBuilder &build() { - for (auto &it : divideGraph()) + for (const auto &it : divideGraph()) { size_t begin_idx = it.first; size_t end_idx = it.second; @@ -314,7 +314,7 @@ struct MDTableBuilder graph.end_ts = std::stoull(_duration_events[end_idx]->ts); graph.setOperations(name_to_op); - for (auto &arg : _duration_events[end_idx]->args) + for (const auto &arg : _duration_events[end_idx]->args) { if (arg.first == "session") graph.session_index = arg.second; @@ -358,7 +358,7 @@ struct MDTableBuilder void MDTableWriter::flush(const std::vector> &records) { - for (auto &recorder : records) + for (const auto &recorder : records) { MDTableBuilder(recorder->duration_events(), recorder->counter_events()).build().write(_os); } diff --git a/runtime/onert/core/src/util/SNPEEventWriter.cc b/runtime/onert/core/src/util/SNPEEventWriter.cc index 4dea6d16c..87bbfc662 100644 --- a/runtime/onert/core/src/util/SNPEEventWriter.cc +++ b/runtime/onert/core/src/util/SNPEEventWriter.cc @@ -103,9 +103,9 @@ void SNPEWriter::flush(const std::vector> &record // Memory { std::unordered_map mem_stats; - for (auto &recorder : recorders) + for (const auto &recorder : recorders) { - for (auto &evt : recorder->counter_events()) + for (const auto &evt : recorder->counter_events()) { auto &mem_stat = mem_stats[evt.name]; uint64_t val = std::stoull(evt.values.at("value")); @@ -114,7 +114,7 @@ void SNPEWriter::flush(const std::vector> &record } auto &mem = exec_data["memory"] = Json::Value{Json::objectValue}; - for (auto &kv : mem_stats) + for (const auto &kv : mem_stats) { auto &key = kv.first; auto &val = 
kv.second; @@ -132,9 +132,9 @@ void SNPEWriter::flush(const std::vector> &record // 2D keys : stats[tid][name] std::unordered_map> stats; std::unordered_map> begin_timestamps; - for (auto &recorder : recorders) + for (const auto &recorder : recorders) { - for (auto &evt : recorder->duration_events()) + for (const auto &evt : recorder->duration_events()) { std::string evt_name = getLabel(*evt); std::string evt_tid = getBackend(*evt); @@ -160,17 +160,17 @@ void SNPEWriter::flush(const std::vector> &record } } - for (auto &kv : begin_timestamps) - for (auto &kv2 : kv.second) + for (const auto &kv : begin_timestamps) + for (const auto &kv2 : kv.second) if (kv2.second != 0) throw std::runtime_error{"Invalid Data - B and E pair does not match."}; - for (auto &kv : stats) + for (const auto &kv : stats) { - auto &tid = kv.first; - auto &map = kv.second; + const auto &tid = kv.first; + const auto &map = kv.second; auto &json_tid = exec_data[tid] = Json::Value{Json::objectValue}; - for (auto &kv : map) + for (const auto &kv : map) { auto &name = kv.first; auto &val = kv.second; diff --git a/runtime/onert/core/src/util/ShapeInference.cc b/runtime/onert/core/src/util/ShapeInference.cc index 173de29c7..862d6f725 100644 --- a/runtime/onert/core/src/util/ShapeInference.cc +++ b/runtime/onert/core/src/util/ShapeInference.cc @@ -608,12 +608,12 @@ ir::Shape inferReshapeShape(const int32_t *shape_buf, const int32_t shape_num_el const size_t total_num_elements) { ir::Shape ret(shape_num_elements); - int32_t flatten_dim = ir::Shape::UNSPECIFIED_DIM; + int32_t flatten_dim = ir::Shape::kUnspecifiedDim; for (int32_t i = 0; i < shape_num_elements; ++i) { if (shape_buf[i] < 0) { - if (flatten_dim != ir::Shape::UNSPECIFIED_DIM) + if (flatten_dim != ir::Shape::kUnspecifiedDim) throw std::runtime_error("Reshape: 2nd param has special dim(for flatten) more than twice"); flatten_dim = i; ret.dim(i) = 1; @@ -623,7 +623,7 @@ ir::Shape inferReshapeShape(const int32_t *shape_buf, const int32_t shape_num_el ret.dim(i) = shape_buf[i]; } } - if (flatten_dim != ir::Shape::UNSPECIFIED_DIM) + if (flatten_dim != ir::Shape::kUnspecifiedDim) ret.dim(flatten_dim) = total_num_elements / ret.num_elements(); // Check reshapable diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h index cf080abbc..878a594cc 100644 --- a/runtime/onert/frontend/base_loader/include/base_loader.h +++ b/runtime/onert/frontend/base_loader/include/base_loader.h @@ -68,8 +68,7 @@ public: * @param model reference to model */ explicit BaseLoader(std::unique_ptr &model) - : _base{nullptr}, _pagesize(getpagesize()), _fd(-1), _model(model), _domain_model{nullptr}, - _tensor_names(std::make_shared>()) + : _base{nullptr}, _pagesize(getpagesize()), _fd(-1), _model(model), _domain_model{nullptr} { _use_mmaped_data = util::getConfigBool(util::config::USE_MMAPED_DATA); } @@ -194,7 +193,7 @@ protected: const Model *_domain_model; // Maps Tensor indices to onert Operands. 
std::vector _tensor_to_operand; - std::shared_ptr> _tensor_names; + std::unordered_map _tensor_names; // Verifier std::unique_ptr _verifier; // Boolean flag to use MMAPED_DATA @@ -411,7 +410,7 @@ ir::OperandIndex BaseLoader::loadOperand(const Tensor *tensor, ir: subg.setOperandValue(operand_index, std::move(data_obj)); } - _tensor_names->emplace(operand_index, tensor->name()->str()); + _tensor_names.emplace(operand_index, tensor->name()->str()); // Variable if (tensor->is_variable()) @@ -1297,8 +1296,8 @@ void BaseLoader::loadIf(const Operator *op, ir::Graph &subg) verifySubgraphIndex(else_index); ir::operation::If::Param param; - param.then_subg_index = ir::SubgraphIndex{static_cast(then_index)}; - param.else_subg_index = ir::SubgraphIndex{static_cast(else_index)}; + param.then_subg_index = ir::SubgraphIndex{static_cast(then_index)}; + param.else_subg_index = ir::SubgraphIndex{static_cast(else_index)}; loadOperationTo(op, subg, param); } @@ -1314,8 +1313,8 @@ void BaseLoader::loadWhile(const Operator *op, ir::Graph &subg) verifySubgraphIndex(body_index); ir::operation::While::Param param; - param.cond_subg_index = ir::SubgraphIndex{static_cast(cond_index)}; - param.body_subg_index = ir::SubgraphIndex{static_cast(body_index)}; + param.cond_subg_index = ir::SubgraphIndex{static_cast(cond_index)}; + param.body_subg_index = ir::SubgraphIndex{static_cast(body_index)}; loadOperationTo(op, subg, param); } @@ -1663,6 +1662,12 @@ void BaseLoader::loadOperation(const Operator *op, ir::Graph &subg case BuiltinOperator::BuiltinOperator_DEPTH_TO_SPACE: loadDepthToSpace(op, subg); return; + case BuiltinOperator::BuiltinOperator_EMBEDDING_LOOKUP: + loadOperationTo(op, subg); + return; + case BuiltinOperator::BuiltinOperator_HASHTABLE_LOOKUP: + loadOperationTo(op, subg); + return; default: throw std::runtime_error( std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op))); @@ -1682,10 +1687,15 @@ template void BaseLoader::loadModel() // Load subgraphs and map operations on subgraph const auto subgraphs = _domain_model->subgraphs(); auto model = std::make_unique(); - for (uint32_t subgraph_index = 0; subgraph_index < subgraphs->size(); ++subgraph_index) + if (subgraphs->size() - 1 > ir::SubgraphIndex::max()) + throw std::runtime_error{"The number of subgraphs cannot exceed " + + std::to_string(ir::SubgraphIndex::max() + 1)}; + for (uint16_t subgraph_index = 0; subgraph_index < subgraphs->size(); ++subgraph_index) { auto subg = loadSubgraph((*_domain_model->subgraphs())[subgraph_index]); - model->push(ir::SubgraphIndex{subgraph_index}, std::move(subg)); + // NOTE: Used () instead of {}, which does not check narrowing. + // It is okay since overflow is checked the above if-statement. 
+ model->push(ir::SubgraphIndex(subgraph_index), std::move(subg)); } _model = std::move(model); } diff --git a/runtime/onert/frontend/circle/src/circle_loader.cc b/runtime/onert/frontend/circle/src/circle_loader.cc index 5abcc9cd0..5bf626d6c 100644 --- a/runtime/onert/frontend/circle/src/circle_loader.cc +++ b/runtime/onert/frontend/circle/src/circle_loader.cc @@ -112,13 +112,13 @@ private: for (const std::int32_t input_ind : *circle_subg->inputs()) { subg->addInput(tensorIdxToOperandIdx(input_ind), - _tensor_names->at(_tensor_to_operand[input_ind])); + _tensor_names.at(_tensor_to_operand[input_ind])); } // Set outputs for (const std::int32_t output_ind : *circle_subg->outputs()) { subg->addOutput(tensorIdxToOperandIdx(output_ind), - _tensor_names->at(_tensor_to_operand[output_ind])); + _tensor_names.at(_tensor_to_operand[output_ind])); } // Create operations for (const auto *op : *circle_subg->operators()) diff --git a/runtime/onert/frontend/nnapi/CMakeLists.txt b/runtime/onert/frontend/nnapi/CMakeLists.txt index dafd84ccf..b66b32e89 100644 --- a/runtime/onert/frontend/nnapi/CMakeLists.txt +++ b/runtime/onert/frontend/nnapi/CMakeLists.txt @@ -24,4 +24,4 @@ target_link_libraries(test_onert_frontend_nnapi PRIVATE ${LIB_ONERT} dl) target_link_libraries(test_onert_frontend_nnapi PRIVATE gtest) target_link_libraries(test_onert_frontend_nnapi PRIVATE gtest_main) -install(TARGETS test_onert_frontend_nnapi DESTINATION unittest_standalone) +install(TARGETS test_onert_frontend_nnapi DESTINATION unittest) diff --git a/runtime/onert/frontend/nnapi/compilation.cc b/runtime/onert/frontend/nnapi/compilation.cc index 871c040ef..2c56f061a 100644 --- a/runtime/onert/frontend/nnapi/compilation.cc +++ b/runtime/onert/frontend/nnapi/compilation.cc @@ -58,7 +58,7 @@ int ANeuralNetworksCompilation_finish(ANeuralNetworksCompilation *compilation) return ANEURALNETWORKS_UNEXPECTED_NULL; } - if (compilation->state() != ::onert::compiler::State::CREATED) + if (compilation->isFinished()) { VERBOSE(NNAPI::Compilation) << "finish: Already finished" << std::endl; return ANEURALNETWORKS_BAD_STATE; @@ -87,7 +87,7 @@ int ANeuralNetworksCompilation_setPreference(ANeuralNetworksCompilation *compila return ANEURALNETWORKS_UNEXPECTED_NULL; } - if (compilation->state() != ::onert::compiler::State::CREATED) + if (compilation->isFinished()) { VERBOSE(NNAPI::Compilation) << "setPreference: Already finished" << std::endl; return ANEURALNETWORKS_BAD_STATE; diff --git a/runtime/onert/frontend/nnapi/execution.cc b/runtime/onert/frontend/nnapi/execution.cc index 19636a84d..4e1a985f3 100644 --- a/runtime/onert/frontend/nnapi/execution.cc +++ b/runtime/onert/frontend/nnapi/execution.cc @@ -37,7 +37,7 @@ int ANeuralNetworksExecution_create(ANeuralNetworksCompilation *compilation, return ANEURALNETWORKS_UNEXPECTED_NULL; } - std::shared_ptr executors; + std::shared_ptr executors; compilation->publish(executors); diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.cc b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.cc index bb247b97f..3b5edc180 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.cc +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.cc @@ -26,9 +26,7 @@ ANeuralNetworksCompilation::ANeuralNetworksCompilation(const ANeuralNetworksMode _compiler{std::make_shared(_model, *_coptions)} { if (model->allowedToFp16()) - { - _compiler->enableToFp16(); - } + _coptions->enableToFp16(); } bool ANeuralNetworksCompilation::finish() noexcept @@ -36,6 +34,7 
@@ bool ANeuralNetworksCompilation::finish() noexcept try { _artifact = _compiler->compile(); + _compiler = nullptr; } catch (const std::exception &e) { diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.h b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.h index dff5c6dc6..3898f1d5e 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.h +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.h @@ -22,7 +22,7 @@ #include "compiler/Compiler.h" #include "ir/Graph.h" #include "ir/Model.h" -#include "exec/Executors.h" +#include "exec/IExecutors.h" #include "util/TracingCtx.h" struct ANeuralNetworksCompilation @@ -32,9 +32,9 @@ public: public: bool finish() noexcept; + bool isFinished() noexcept { return _compiler == nullptr; } - onert::compiler::State state(void) noexcept { return _compiler->state(); } - void publish(std::shared_ptr &executors) noexcept + void publish(std::shared_ptr &executors) noexcept { executors = _artifact ? _artifact->_executors : nullptr; } diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h index 110c7cd55..6fbc4c2e0 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h @@ -26,7 +26,7 @@ struct ANeuralNetworksExecution { public: - ANeuralNetworksExecution(const std::shared_ptr &executors) + ANeuralNetworksExecution(const std::shared_ptr &executors) : _execution{std::make_shared(executors)} { // DO NOTHING diff --git a/runtime/onert/frontend/tflite/src/tflite_loader.cc b/runtime/onert/frontend/tflite/src/tflite_loader.cc index fe69e4e2a..dc8564632 100644 --- a/runtime/onert/frontend/tflite/src/tflite_loader.cc +++ b/runtime/onert/frontend/tflite/src/tflite_loader.cc @@ -99,13 +99,13 @@ private: for (const std::int32_t input_ind : *tflite_subg->inputs()) { subg->addInput(tensorIdxToOperandIdx(input_ind), - _tensor_names->at(_tensor_to_operand[input_ind])); + _tensor_names.at(_tensor_to_operand[input_ind])); } // Set outputs for (const std::int32_t output_ind : *tflite_subg->outputs()) { subg->addOutput(tensorIdxToOperandIdx(output_ind), - _tensor_names->at(_tensor_to_operand[output_ind])); + _tensor_names.at(_tensor_to_operand[output_ind])); } // Create operations for (const auto *op : *tflite_subg->operators()) @@ -113,7 +113,6 @@ private: loadOperation(op, *subg); } - subg->setTensorName(_tensor_names); subg->verify(); return subg; -- cgit v1.2.3
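Note on the narrowing comment added to BaseLoader::loadModel above: brace-initialization rejects implicit narrowing conversions at compile time, while parenthesized construction does not, so the loader range-checks the 32-bit subgraph count before constructing 16-bit subgraph indices with (). Below is a minimal standalone sketch of that guard-then-convert pattern; Index16 is a hypothetical stand-in, not onert's real ir::SubgraphIndex.

#include <cstdint>
#include <stdexcept>
#include <string>

// Hypothetical 16-bit index wrapper; a stand-in for ir::SubgraphIndex, not the real class.
struct Index16
{
  explicit Index16(uint16_t v) : _value{v} {}
  static constexpr uint32_t max() { return UINT16_MAX; }
  uint16_t value() const { return _value; }

private:
  uint16_t _value;
};

// Guard first, then convert: the explicit range check is what makes the
// unchecked ()-style construction safe.
Index16 makeIndex(uint32_t raw)
{
  if (raw > Index16::max())
    throw std::runtime_error{"index out of range: " + std::to_string(raw)};
  // Index16{raw} would not compile: list-initialization rejects the narrowing
  // uint32_t -> uint16_t conversion. Index16(raw) does not check narrowing,
  // which is acceptable only because of the range check above.
  return Index16(raw);
}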
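The interpreter's OperationUtil.h removed earlier in this patch computed elementwise output shapes with the usual right-aligned broadcasting rule (paired dimensions must match or one of them must be 1). A minimal sketch of that rule over plain std::vector<int32_t>, with illustrative names only rather than the removed onert helpers:

#include <algorithm>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Right-aligned broadcasting: walk both shapes from the last dimension;
// each pair must match or contain a 1, and missing leading dimensions act as 1.
std::vector<int32_t> broadcastShapes(const std::vector<int32_t> &lhs,
                                     const std::vector<int32_t> &rhs)
{
  const size_t out_rank = std::max(lhs.size(), rhs.size());
  std::vector<int32_t> out(out_rank, 1);
  for (size_t i = 0; i < out_rank; ++i)
  {
    const int32_t l = i < lhs.size() ? lhs[lhs.size() - 1 - i] : 1;
    const int32_t r = i < rhs.size() ? rhs[rhs.size() - 1 - i] : 1;
    if (l != r && l != 1 && r != 1)
      throw std::runtime_error{"broadcast: incompatible dimensions"};
    out[out_rank - 1 - i] = std::max(l, r);
  }
  return out;
}

// Example: broadcastShapes({2, 3, 1, 4}, {3, 5, 4}) yields {2, 3, 5, 4}.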