Imported Upstream version 1.21.0 upstream/1.21.0 tizen_7.0_m2_release accepted/tizen/unified/20220912.170817 accepted/tizen/unified/20220912.164738 accepted/tizen/7.0/unified/hotfix/20221116.105341 accepted/tizen/7.0/unified/20221110.060236 tizen_7.0_hotfix tizen_7.0 accepted/tizen_7.0_unified_hotfix accepted/tizen_7.0_unified

author: Chunseok Lee <chunseok.lee@samsung.com> 2022-09-07 19:04:21 +0900
committer: Chunseok Lee <chunseok.lee@samsung.com> 2022-09-07 19:04:21 +0900
commit: c690d52bdd137ed6a17353aa7af35e8141ece77b (patch)
tree: dbb7dd99133132dfbffcb8c9e9af4f1ffc2f4808 /compiler
parent: 3ad689f0803519e343c36d5700646e86059df961 (diff)
download: nnfw-c690d52bdd137ed6a17353aa7af35e8141ece77b.tar.gz
nnfw-c690d52bdd137ed6a17353aa7af35e8141ece77b.tar.bz2
nnfw-c690d52bdd137ed6a17353aa7af35e8141ece77b.zip
1301 files changed, 60076 insertions, 3238 deletions
diff --git a/compiler/arser/include/arser/arser.h b/compiler/arser/include/arser/arser.h
index 1703e421e..43f99dc5e 100644
--- a/compiler/arser/include/arser/arser.h
+++ b/compiler/arser/include/arser/arser.h
@@ -303,7 +303,7 @@ private:
   std::string _long_name;
   std::string _short_name;
   std::vector<std::string> _names;
-  std::string _type;
+  std::string _type = "string";
   std::string _help_message;
   std::function<void(void)> _func;
   uint32_t _nargs{1};
@@ -540,16 +540,20 @@ public:
     /*
     ** print usage
     */
+    auto print_usage_arg = [&](const arser::Argument &arg) {
+      stream << " ";
+      std::string arg_name = arser::internal::remove_dash(arg._long_name);
+      std::for_each(arg_name.begin(), arg_name.end(),
+                    [&stream](const char &c) { stream << static_cast<char>(::toupper(c)); });
+    };
     stream << "Usage: ./" << parser._program_name << " ";
     // required optional argument
     for (const auto &arg : parser._optional_arg_vec)
     {
       if (!arg._is_required)
         continue;
-      stream << arg._short_name << " ";
-      std::string arg_name = arser::internal::remove_dash(arg._long_name);
-      std::for_each(arg_name.begin(), arg_name.end(),
-                    [&stream](const char &c) { stream << static_cast<char>(::toupper(c)); });
+      stream << arg._short_name;
+      print_usage_arg(arg);
       stream << " ";
     }
     // rest of the optional argument
@@ -560,10 +564,7 @@ public:
       stream << "[" << arg._short_name;
       if (arg._nargs)
       {
-        stream << " ";
-        std::string arg_name = arser::internal::remove_dash(arg._long_name);
-        std::for_each(arg_name.begin(), arg_name.end(),
-                      [&stream](const char &c) { stream << static_cast<char>(::toupper(c)); });
+        print_usage_arg(arg);
       }
       stream << "]"
              << " ";
@@ -591,39 +592,28 @@ public:
     }
 
     const size_t message_width = 60;
-    // positional argument
-    if (!parser._positional_arg_vec.empty())
-    {
-      stream << "[Positional argument]" << std::endl;
-      for (const auto &arg : parser._positional_arg_vec)
+    auto print_help_args = [&](const std::list<Argument> &args, const std::string &title) {
+      if (!args.empty())
       {
-        stream.width(length_of_longest_arg);
-        stream << std::left << arser::internal::make_comma_concatenated(arg._names) << "\t";
-        for (size_t i = 0; i < arg._help_message.length(); i += message_width)
+        stream << title << std::endl;
+        for (const auto &arg : args)
         {
-          if (i)
-            stream << std::string(length_of_longest_arg, ' ') << "\t";
-          stream << arg._help_message.substr(i, message_width) << std::endl;
+          stream.width(length_of_longest_arg);
+          stream << std::left << arser::internal::make_comma_concatenated(arg._names) << "\t";
+          for (size_t i = 0; i < arg._help_message.length(); i += message_width)
+          {
+            if (i)
+              stream << std::string(length_of_longest_arg, ' ') << "\t";
+            stream << arg._help_message.substr(i, message_width) << std::endl;
+          }
         }
+        std::cout << std::endl;
       }
-      std::cout << std::endl;
-    }
+    };
+    // positional argument
+    print_help_args(parser._positional_arg_vec, "[Positional argument]");
     // optional argument
-    if (!parser._optional_arg_vec.empty())
-    {
-      stream << "[Optional argument]" << std::endl;
-      for (const auto &arg : parser._optional_arg_vec)
-      {
-        stream.width(length_of_longest_arg);
-        stream << std::left << arser::internal::make_comma_concatenated(arg._names) << "\t";
-        for (size_t i = 0; i < arg._help_message.length(); i += message_width)
-        {
-          if (i)
-            stream << std::string(length_of_longest_arg, ' ') << "\t";
-          stream << arg._help_message.substr(i, message_width) << std::endl;
-        }
-      }
-    }
+    print_help_args(parser._optional_arg_vec, "[Optional argument]");
 
     return stream;
   }
@@ -737,6 +727,29 @@ template <typename T> T Arser::get(const std::string &arg_name)
   return get_impl(arg_name, static_cast<T *>(nullptr));
 }
 
+// Registers options that several command line tools declare in the same way,
+// so that each tool does not have to repeat the same add_argument() chain
+class Helper
+{
+public:
+  // Adds "--version": an optional flag without a value that passes func to exit_with()
+  static void add_version(Arser &arser, const std::function<void(void)> &func)
+  {
+    auto &&version = arser.add_argument("--version");
+    version.nargs(0).required(false).default_value(false)
+      .help("Show version information and exit")
+      .exit_with(func);
+  }
+
+  // Adds "-V"/"--verbose": an optional flag without a value, false by default
+  static void add_verbose(Arser &arser)
+  {
+    auto &&verbose = arser.add_argument("-V", "--verbose");
+    verbose.nargs(0).required(false).default_value(false)
+      .help("output additional information to stdout or stderr");
+  }
+};
+
 } // namespace arser
 
 #endif // __ARSER_H__
diff --git a/compiler/circle-eval-diff/CMakeLists.txt b/compiler/circle-eval-diff/CMakeLists.txt
index 4d86f8097..d5a62301c 100644
--- a/compiler/circle-eval-diff/CMakeLists.txt
+++ b/compiler/circle-eval-diff/CMakeLists.txt
@@ -6,6 +6,7 @@ list(REMOVE_ITEM SOURCES ${TESTS})
 
 add_executable(circle-eval-diff ${DRIVER} ${SOURCES})
 target_include_directories(circle-eval-diff PRIVATE include)
+target_include_directories(circle-eval-diff PRIVATE src)
 
 target_link_libraries(circle-eval-diff arser)
 target_link_libraries(circle-eval-diff safemain)
@@ -17,6 +18,8 @@ target_link_libraries(circle-eval-diff luci_interpreter)
 target_link_libraries(circle-eval-diff dio_hdf5)
 target_link_libraries(circle-eval-diff vconone)
 
+install(TARGETS circle-eval-diff DESTINATION bin)
+
 if(NOT ENABLE_TEST)
   return()
 endif(NOT ENABLE_TEST)
@@ -25,10 +28,15 @@ endif(NOT ENABLE_TEST)
 # Instead, we use TEST_SOURCES to specify sources uesd for tests.
 set(TEST_SOURCES
     "src/MetricPrinter.cpp"
-    "src/Tensor.cpp")
+    "src/Tensor.cpp"
+    "src/InputDataLoader.cpp")
 
 nnas_find_package(GTest REQUIRED)
 GTest_AddTest(circle_eval_diff_test ${TESTS} ${TEST_SOURCES})
+target_include_directories(circle_eval_diff_test PRIVATE include)
 target_include_directories(circle_eval_diff_test PRIVATE src)
 target_link_libraries(circle_eval_diff_test luci_testhelper)
 target_link_libraries(circle_eval_diff_test nncc_coverage)
+target_link_libraries(circle_eval_diff_test dio_hdf5)
+target_link_libraries(circle_eval_diff_test loco)
+target_link_libraries(circle_eval_diff_test luci_lang)
diff --git a/compiler/circle-eval-diff/driver/Driver.cpp b/compiler/circle-eval-diff/driver/Driver.cpp
index f4a12a403..7e63ec88c 100644
--- a/compiler/circle-eval-diff/driver/Driver.cpp
+++ b/compiler/circle-eval-diff/driver/Driver.cpp
@@ -30,19 +30,15 @@ std::string to_lower_case(std::string s)
   return s;
 }
 
-Metric to_metric(const std::string &str)
-{
-  if (to_lower_case(str).compare("mae") == 0)
-    return Metric::MAE;
-
-  throw std::runtime_error("Unsupported metric.");
-}
-
 InputFormat to_input_format(const std::string &str)
 {
-  if (to_lower_case(str).compare("h5") == 0)
+  const auto small_str = to_lower_case(str);
+  if (small_str == "h5" || small_str == "hdf5") // "hdf5" is advertised by the option help
     return InputFormat::H5;
 
+  if (small_str == "directory" || small_str == "dir")
+    return InputFormat::DIR;
+
   throw std::runtime_error("Unsupported input format.");
 }
 
@@ -58,50 +54,50 @@ int entry(const int argc, char **argv)
 {
   arser::Arser arser("Compare inference results of two circle models");
 
-  arser.add_argument("--version")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("Show version information and exit")
-    .exit_with(print_version);
+  arser::Helper::add_version(arser, print_version);
 
-  arser.add_argument("--first_model")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(true)
-    .help("First input model filepath");
+  arser.add_argument("--first_model").required(true).help("First input model filepath");
 
-  arser.add_argument("--second_model")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(true)
-    .help("Second input model filepath");
+  arser.add_argument("--second_model").required(true).help("Second input model filepath");
 
   arser.add_argument("--first_input_data")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
     .help("Input data filepath for the first model. If not given, circle-eval-diff will run with "
           "randomly generated data");
 
   arser.add_argument("--second_input_data")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
     .help("Input data filepath for the second model. If not given, circle-eval-diff will run with "
           "randomly generated data");
 
-  arser.add_argument("--metric")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
-    .default_value("MAE")
-    .help("Metric for comparison (default: MAE)");
+  arser.add_argument("--dump_output_with_prefix")
+    .help("Dump output to files. <prefix> should be given as an argument. "
+          "Outputs are saved in <prefix>.<data_index>.first.output<output_index> and "
+          "<prefix>.<data_index>.second.output<output_index>.");
+
+  arser.add_argument("--print_mae").nargs(0).default_value(false).help("Print Mean Absolute Error");
+
+  arser.add_argument("--print_mape")
+    .nargs(0)
+    .default_value(false)
+    .help("Print Mean Absolute PercentageError");
+
+  arser.add_argument("--print_mpeir")
+    .nargs(0)
+    .default_value(false)
+    .help("Print Mean Peak Error to Interval Ratio");
+
+  arser.add_argument("--print_top1_match")
+    .nargs(0)
+    .default_value(false)
+    .help("Print Mean Top-1 Match Ratio");
+
+  arser.add_argument("--print_top5_match")
+    .nargs(0)
+    .default_value(false)
+    .help("Print Mean Top-5 Match Ratio");
+
+  arser.add_argument("--print_mse").nargs(0).default_value(false).help("Print Mean Squared Error");
 
   arser.add_argument("--input_data_format")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
     .default_value("h5")
     .help("Input data format. h5/hdf5 (default) or directory");
 
@@ -124,6 +120,7 @@ int entry(const int argc, char **argv)
   std::string second_input_data_path;
   std::string metric;
   std::string input_data_format;
+  std::string output_prefix;
 
   if (arser["--first_input_data"])
     first_input_data_path = arser.get<std::string>("--first_input_data");
@@ -135,22 +132,54 @@ int entry(const int argc, char **argv)
     throw std::runtime_error("Input data path should be given for both first_model and "
                              "second_model, or neither must be given.");
 
-  metric = arser.get<std::string>("--metric");
+  if (arser["--dump_output_with_prefix"])
+    output_prefix = arser.get<std::string>("--dump_output_with_prefix");
+
+  // Set Metrics
+  std::vector<Metric> metrics;
+  if (arser["--print_mae"] and arser.get<bool>("--print_mae"))
+  {
+    metrics.emplace_back(Metric::MAE);
+  }
+  if (arser["--print_mape"] and arser.get<bool>("--print_mape"))
+  {
+    metrics.emplace_back(Metric::MAPE);
+  }
+  if (arser["--print_mpeir"] and arser.get<bool>("--print_mpeir"))
+  {
+    metrics.emplace_back(Metric::MPEIR);
+  }
+  if (arser["--print_top1_match"] and arser.get<bool>("--print_top1_match"))
+  {
+    metrics.emplace_back(Metric::MTOP1);
+  }
+  if (arser["--print_top5_match"] and arser.get<bool>("--print_top5_match"))
+  {
+    metrics.emplace_back(Metric::MTOP5);
+  }
+  if (arser["--print_mse"] and arser.get<bool>("--print_mse"))
+  {
+    metrics.emplace_back(Metric::MSE);
+  }
+
   input_data_format = arser.get<std::string>("--input_data_format");
 
   auto ctx = std::make_unique<CircleEvalDiff::Context>();
   {
     ctx->first_model_path = first_model_path;
     ctx->second_model_path = second_model_path;
-    ctx->metric = to_metric(metric);
+    ctx->first_input_data_path = first_input_data_path;
+    ctx->second_input_data_path = second_input_data_path;
+    ctx->metric = metrics;
     ctx->input_format = to_input_format(input_data_format);
+    ctx->output_prefix = output_prefix;
   }
 
   CircleEvalDiff ced(std::move(ctx));
 
   ced.init();
 
-  ced.evalDiff(first_input_data_path, second_input_data_path);
+  ced.evalDiff();
 
   return EXIT_SUCCESS;
 }
diff --git a/compiler/circle-eval-diff/include/CircleEvalDiff.h b/compiler/circle-eval-diff/include/CircleEvalDiff.h
index bf6aff46d..7894480ac 100644
--- a/compiler/circle-eval-diff/include/CircleEvalDiff.h
+++ b/compiler/circle-eval-diff/include/CircleEvalDiff.h
@@ -20,8 +20,12 @@
 #include <luci/IR/Module.h>
 #include <luci_interpreter/Interpreter.h>
 
+#include "InputDataLoader.h"
+#include "MetricPrinter.h"
+
 #include <string>
 #include <memory>
+#include <vector>
 
 namespace circle_eval_diff
 {
@@ -32,14 +36,12 @@ class ModuleEvalDiff;
 enum class Metric
 {
   Undefined, // For debugging
-  MAE,
-};
-
-enum class InputFormat
-{
-  Undefined, // For debugging
-  H5,
-  // TODO Implement Random, Directory
+  MAE,       // Mean Absolute Error
+  MAPE,      // Mean Absolute Percentage Error
+  MPEIR,     // Mean Peak Error to Interval Ratio
+  MTOP1,     // Mean Top-1 Match Ratio
+  MTOP5,     // Mean Top-5 Match Ratio
+  MSE,       // Mean Squared Error
 };
 
 class CircleEvalDiff final
@@ -49,8 +51,11 @@ public:
   {
     std::string first_model_path;
     std::string second_model_path;
-    Metric metric = Metric::Undefined;
+    std::string first_input_data_path;
+    std::string second_input_data_path;
+    std::vector<Metric> metric;
     InputFormat input_format = InputFormat::Undefined;
+    std::string output_prefix;
   };
 
 public:
@@ -61,12 +66,13 @@ public:
   void init();
 
   // Evaluate two circle models for the given input data and compare the results
-  void evalDiff(const std::string &first_input_data_path,
-                const std::string &second_input_data_path) const;
+  void evalDiff(void) const;
 
 private:
   std::unique_ptr<Context> _ctx;
-  std::unique_ptr<ModuleEvalDiff> _runner;
+  std::unique_ptr<luci::Module> _first_module;
+  std::unique_ptr<luci::Module> _second_module;
+  std::vector<std::unique_ptr<MetricPrinter>> _metrics;
 };
 
 } // namespace circle_eval_diff
diff --git a/compiler/circle-eval-diff/src/CircleEvalDiff.cpp b/compiler/circle-eval-diff/src/CircleEvalDiff.cpp
index c39a11371..43e026bf6 100644
--- a/compiler/circle-eval-diff/src/CircleEvalDiff.cpp
+++ b/compiler/circle-eval-diff/src/CircleEvalDiff.cpp
@@ -15,8 +15,9 @@
  */
 
 #include "CircleEvalDiff.h"
-#include "ModuleEvalDiff.h"
+#include "InputDataLoader.h"
 #include "MetricPrinter.h"
+#include "Tensor.h"
 
 #include <foder/FileLoader.h>
 #include <luci/Importer.h>
@@ -26,6 +27,25 @@
 namespace
 {
 
+// Return true iff both nodes have equal rank and every dimension compares equal
+bool same_shape(const luci::CircleNode *a, const luci::CircleNode *b)
+{
+  const auto rank = a->rank();
+  if (rank != b->rank())
+    return false;
+  // Advance while dimensions agree; stopping early means a mismatch was found
+  uint32_t axis = 0;
+  while (axis < rank and a->dim(axis) == b->dim(axis))
+    ++axis;
+  return axis == rank;
+}
+
+// Return true iff dtype() of both nodes compares equal
+bool same_dtype(const luci::CircleNode *a, const luci::CircleNode *b)
+{
+  return a->dtype() == b->dtype();
+}
+
 std::unique_ptr<luci::Module> import(const std::string &model_path)
 {
   // Load model from the file
@@ -40,7 +60,12 @@ std::unique_ptr<luci::Module> import(const std::string &model_path)
     throw std::runtime_error("Failed to verify circle '" + model_path + "'");
   }
 
-  auto module = luci::Importer().importModule(circle::GetModel(model_data.data()));
+  auto circle_model = circle::GetModel(model_data.data());
+
+  if (not circle_model)
+    throw std::runtime_error("Failed to load '" + model_path + "'");
+
+  auto module = luci::Importer().importModule(circle_model);
 
   if (not module)
     throw std::runtime_error("Failed to load '" + model_path + "'");
@@ -48,50 +73,192 @@ std::unique_ptr<luci::Module> import(const std::string &model_path)
   return module;
 }
 
+const std::vector<loco::Node *> inputs_of(const luci::Module *module)
+{ // Forwards to loco::input_nodes() for the graph returned by module->graph()
+  return loco::input_nodes(module->graph());
+}
+
+const std::vector<loco::Node *> outputs_of(const luci::Module *module)
+{ // Forwards to loco::output_nodes() for the graph returned by module->graph()
+  return loco::output_nodes(module->graph());
+}
+
+// Write data_size bytes of data to filename in binary mode; throws on open/write failure
+void writeDataToFile(const std::string &filename, const char *data, size_t data_size)
+{
+  std::ofstream out{filename, std::ios::binary};
+  if (out.fail())
+    throw std::runtime_error("Cannot open file \"" + filename + "\".\n");
+  out.write(data, data_size);
+  if (out.fail())
+    throw std::runtime_error("Failed to write data to file \"" + filename + "\".\n");
+}
+
+// Throw std::runtime_error unless both modules expose the same number of outputs
+// and every pair of outputs at the same position agrees in shape and dtype
+void checkOutputs(const luci::Module *first, const luci::Module *second)
+{
+  const auto lhs_nodes = outputs_of(first);
+  const auto rhs_nodes = outputs_of(second);
+  const auto count = lhs_nodes.size();
+  if (count != rhs_nodes.size())
+    throw std::runtime_error("Models have different output counts");
+
+  for (size_t pos = 0; pos != count; ++pos)
+  {
+    const auto lhs = loco::must_cast<luci::CircleNode *>(lhs_nodes[pos]);
+    const auto rhs = loco::must_cast<luci::CircleNode *>(rhs_nodes[pos]);
+    // Both messages share the " (first, second)" suffix built from the node names
+    const auto names = " (" + lhs->name() + ", " + rhs->name() + ")";
+    if (not same_shape(lhs, rhs))
+      throw std::runtime_error("Output shape mismatch" + names);
+    if (not same_dtype(lhs, rhs))
+      throw std::runtime_error("Output dtype mismatch" + names);
+  }
+}
+
 } // namespace
 
 namespace circle_eval_diff
 {
 
-CircleEvalDiff::CircleEvalDiff(std::unique_ptr<Context> &&ctx)
-  : _ctx(std::move(ctx)), _runner(nullptr)
+// Run module once on data and return one tensor per graph output, in output order.
+// Throws std::runtime_error when data holds more tensors than the model has inputs.
+std::vector<std::shared_ptr<Tensor>> interpret(const luci::Module *module,
+                                               const InputDataLoader::Data &data)
+{
+  auto interpreter = std::make_unique<luci_interpreter::Interpreter>(module);
+  const auto input_nodes = ::inputs_of(module);
+  const auto output_nodes = ::outputs_of(module);
+  // input_nodes is indexed by data position below, so reject surplus data up front
+  if (data.size() > input_nodes.size())
+    throw std::runtime_error("Input data has more tensors than the model has inputs.");
+
+  for (uint32_t input_idx = 0; input_idx < data.size(); input_idx++)
+  {
+    auto input_node = loco::must_cast<const luci::CircleInput *>(input_nodes[input_idx]);
+    assert(input_node->index() == input_idx);
+    auto input_data = data.at(input_idx);
+    interpreter->writeInputTensor(input_node, input_data.buffer(), input_data.byte_size());
+  }
+  interpreter->interpret();
+
+  std::vector<std::shared_ptr<Tensor>> outputs;
+  for (uint32_t output_idx = 0; output_idx < output_nodes.size(); output_idx++)
+  {
+    auto output_node = loco::must_cast<const luci::CircleOutput *>(output_nodes[output_idx]);
+    assert(output_node->index() == output_idx);
+    auto tensor = createEmptyTensor(output_node);
+    interpreter->readOutputTensor(output_node, tensor->buffer(), tensor->byte_size());
+    outputs.emplace_back(tensor);
+  }
+  return outputs;
+}
+
+CircleEvalDiff::CircleEvalDiff(std::unique_ptr<Context> &&ctx) : _ctx(std::move(ctx))
 {
+  // DO NOTHING
 }
 
 CircleEvalDiff::~CircleEvalDiff() = default;
 
 void CircleEvalDiff::init()
 {
+  _first_module = import(_ctx->first_model_path);
+  _second_module = import(_ctx->second_model_path);
+
+  // Check modules have the same output signature (dtype/shape)
+  // Exception will be thrown if they have different signature
+  checkOutputs(_first_module.get(), _second_module.get());
+
   // Set metric
   std::unique_ptr<MetricPrinter> metric;
-  switch (_ctx->metric)
+  for (auto metric : _ctx->metric)
   {
-    case Metric::MAE:
-      metric = std::make_unique<MAEPrinter>();
-      break;
-    default:
-      throw std::runtime_error("Unsupported metric.");
+    switch (metric)
+    {
+      case Metric::MAE:
+      {
+        _metrics.emplace_back(std::make_unique<MAEPrinter>());
+        break;
+      }
+      case Metric::MAPE:
+      {
+        _metrics.emplace_back(std::make_unique<MAPEPrinter>());
+        break;
+      }
+      case Metric::MPEIR:
+      {
+        _metrics.emplace_back(std::make_unique<MPEIRPrinter>());
+        break;
+      }
+      case Metric::MTOP1:
+      {
+        _metrics.emplace_back(std::make_unique<TopKMatchPrinter>(1));
+        break;
+      }
+      case Metric::MTOP5:
+      {
+        _metrics.emplace_back(std::make_unique<TopKMatchPrinter>(5));
+        break;
+      }
+      case Metric::MSE:
+      {
+        _metrics.emplace_back(std::make_unique<MSEPrinter>());
+        break;
+      }
+      default:
+        throw std::runtime_error("Unsupported metric.");
+    }
+    _metrics.back()->init(_first_module.get(), _second_module.get());
   }
+}
 
-  auto first_module = import(_ctx->first_model_path);
-  auto second_module = import(_ctx->second_model_path);
+void CircleEvalDiff::evalDiff(void) const
+{
+  auto first_input_loader = circle_eval_diff::makeDataLoader(
+    _ctx->first_input_data_path, _ctx->input_format, ::inputs_of(_first_module.get()));
+  auto second_input_loader = circle_eval_diff::makeDataLoader(
+    _ctx->second_input_data_path, _ctx->input_format, ::inputs_of(_second_module.get()));
 
-  // Set runner
-  switch (_ctx->input_format)
+  for (uint32_t data_idx = 0; data_idx < first_input_loader->size(); data_idx++)
   {
-    case InputFormat::H5:
-      _runner = std::make_unique<H5InputEvalDiff>(std::move(first_module), std::move(second_module),
-                                                  std::move(metric));
-      break;
-    default:
-      throw std::runtime_error("Unsupported input format.");
+    std::cout << "Evaluating " << data_idx << "'th data" << std::endl;
+
+    auto first_data = first_input_loader->get(data_idx);
+    auto second_data = second_input_loader->get(data_idx);
+
+    auto first_output = interpret(_first_module.get(), first_data);
+    auto second_output = interpret(_second_module.get(), second_data);
+
+    for (auto &metric : _metrics)
+    {
+      metric->accumulate(first_output, second_output);
+    }
+
+    if (_ctx.get()->output_prefix.empty())
+      continue;
+
+    for (uint32_t i = 0; i < first_output.size(); i++)
+    {
+      auto out = first_output[i];
+      writeDataToFile(_ctx.get()->output_prefix + "." + std::to_string(data_idx) + ".first.output" +
+                        std::to_string(i),
+                      (char *)(out->buffer()), out->byte_size());
+    }
+    for (uint32_t i = 0; i < second_output.size(); i++)
+    {
+      auto out = second_output[i];
+      writeDataToFile(_ctx.get()->output_prefix + "." + std::to_string(data_idx) +
+                        ".second.output" + std::to_string(i),
+                      (char *)(out->buffer()), out->byte_size());
+    }
   }
-}
 
-void CircleEvalDiff::evalDiff(const std::string &first_input_data_path,
-                              const std::string &second_input_data_path) const
-{
-  _runner->evalDiff(first_input_data_path, second_input_data_path);
+  for (auto &metric : _metrics)
+  {
+    std::cout << metric.get() << std::endl;
+  }
 }
 
 } // namespace circle_eval_diff
diff --git a/compiler/circle-eval-diff/src/InputDataLoader.cpp b/compiler/circle-eval-diff/src/InputDataLoader.cpp
new file mode 100644
index 000000000..99276f32a
--- /dev/null
+++ b/compiler/circle-eval-diff/src/InputDataLoader.cpp
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "InputDataLoader.h"
+
+#include <dio_hdf5/HDF5Importer.h>
+#include <loco/IR/Graph.h>
+#include <luci/IR/CircleNodes.h>
+
+#include <cstring>
+#include <dirent.h>
+#include <fstream>
+#include <vector>
+
+using DataType = loco::DataType;
+using Shape = std::vector<loco::Dimension>;
+
+namespace circle_eval_diff
+{
+
+// Validate dtype/shape against input_node; throws std::runtime_error on the first
+// mismatch found (checked in order: element type, rank, then each dimension)
+void verifyTypeShape(const luci::CircleInput *input_node, const DataType &dtype, const Shape &shape)
+{
+  if (input_node->dtype() != dtype)
+    throw std::runtime_error("Wrong input type.");
+
+  const auto rank = shape.size();
+  if (input_node->rank() != rank)
+    throw std::runtime_error("Input rank mismatch.");
+
+  uint32_t axis = 0;
+  for (const auto &dim : shape)
+    if (not(dim == input_node->dim(axis++)))
+      throw std::runtime_error("Input shape mismatch.");
+}
+
+// Return the size in bytes of each input node's tensor, in the order of nodes.
+// Each entry is the dtype size times all dimensions, matching getTotalByteSizeOf.
+std::vector<size_t> getEachByteSizeOf(const std::vector<loco::Node *> &nodes)
+{
+  std::vector<size_t> vec;
+  vec.reserve(nodes.size());
+
+  for (const auto node : nodes)
+  {
+    const auto input_node = loco::must_cast<const luci::CircleInput *>(node);
+    // Start from the element size: starting from 1 would count elements, not bytes
+    size_t byte_size = loco::size(input_node->dtype());
+    for (uint32_t index = 0; index < input_node->rank(); index++)
+      byte_size *= input_node->dim(index).value();
+
+    vec.push_back(byte_size);
+  }
+  return vec;
+}
+
+// Sum of the byte sizes of all input nodes (dtype size times every dimension)
+size_t getTotalByteSizeOf(const std::vector<loco::Node *> &nodes)
+{
+  size_t sum = 0;
+  for (const auto node : nodes)
+  {
+    const auto input = loco::must_cast<const luci::CircleInput *>(node);
+    const auto rank = input->rank();
+
+    // Fold the dimensions into the per-element size of this input
+    size_t bytes = loco::size(input->dtype());
+    uint32_t axis = 0;
+    while (axis < rank)
+      bytes *= input->dim(axis++).value();
+
+    sum += bytes;
+  }
+  return sum;
+}
+
+} // namespace circle_eval_diff
+
+namespace circle_eval_diff
+{
+
+// Open file_path and import its "value" group; H5 errors become std::runtime_error
+HDF5Loader::HDF5Loader(const std::string &file_path, const std::vector<loco::Node *> &input_nodes)
+  : _input_nodes{input_nodes}
+{
+  try
+  {
+    _hdf5 = std::make_unique<dio::hdf5::HDF5Importer>(file_path);
+    _hdf5->importGroup("value");
+  }
+  catch (const H5::Exception &)
+  {
+    H5::Exception::printErrorStack();
+    throw std::runtime_error("HDF5 error occurred.");
+  }
+}
+
+// Number of data records as reported by the importer's numData()
+uint32_t HDF5Loader::size(void) const { return _hdf5->numData(); }
+
+// Build one tensor per input node and fill each from record data_idx of the HDF5 file.
+// For non-raw files the stored dtype/shape is verified against the input node.
+InputDataLoader::Data HDF5Loader::get(uint32_t data_idx) const
+{
+  const auto num_inputs = _input_nodes.size();
+  Data data(num_inputs);
+
+  for (uint32_t input_idx = 0; input_idx < num_inputs; input_idx++)
+  {
+    auto input_node = loco::must_cast<luci::CircleInput *>(_input_nodes.at(input_idx));
+    assert(input_node->index() == input_idx);
+
+    auto &tensor = data.at(input_idx);
+    tensor = *createEmptyTensor(input_node).get();
+    auto input_buffer = tensor.buffer();
+    try
+    {
+      if (not _hdf5->isRawData())
+      {
+        // Read with metadata so the stored type and shape can be validated
+        DataType dtype;
+        Shape shape;
+        _hdf5->readTensor(data_idx, input_idx, &dtype, &shape, input_buffer);
+        verifyTypeShape(input_node, dtype, shape);
+      }
+      else
+      {
+        _hdf5->readTensor(data_idx, input_idx, input_buffer);
+      }
+    }
+    catch (const H5::Exception &)
+    {
+      H5::Exception::printErrorStack();
+      throw std::runtime_error("HDF5 error occurred.");
+    }
+  }
+  return data;
+}
+
+// Collect the path of every regular file directly inside dir_path.
+// Throws std::runtime_error when the directory cannot be opened.
+DirectoryLoader::DirectoryLoader(const std::string &dir_path,
+                                 const std::vector<loco::Node *> &input_nodes)
+  : _input_nodes{input_nodes}
+{
+  // The deleter closes the handle on every exit path, including exceptions
+  std::unique_ptr<DIR, int (*)(DIR *)> dir{opendir(dir_path.c_str()), &closedir};
+  if (not dir)
+    throw std::runtime_error("Cannot open directory \"" + dir_path + "\".");
+
+  // NOTE(review): readdir() does not promise any ordering of entries, so data
+  // indices may not line up across two directories - confirm whether callers care
+  while (const struct dirent *entry = readdir(dir.get()))
+  {
+    // Skip if the entry is not a regular file
+    if (entry->d_type != DT_REG)
+      continue;
+
+    _data_paths.push_back(dir_path + "/" + entry->d_name);
+  }
+}
+
+// Number of data files collected by the constructor
+uint32_t DirectoryLoader::size(void) const { return _data_paths.size(); }
+
+// Load the data_idx'th file as one contiguous blob and slice it into one tensor per
+// input node, in node order. Throws std::runtime_error when the file cannot be opened
+// or holds fewer bytes than getTotalByteSizeOf() reports for the inputs.
+InputDataLoader::Data DirectoryLoader::get(uint32_t data_idx) const
+{
+  const auto total_bytes = getTotalByteSizeOf(_input_nodes);
+  std::vector<char> blob(total_bytes);
+  const auto path = _data_paths.at(data_idx);
+
+  std::ifstream file(path, std::ios::binary);
+  if (file.fail())
+    throw std::runtime_error("Cannot open file \"" + path + "\".");
+
+  file.read(blob.data(), total_bytes);
+  if (file.fail())
+    throw std::runtime_error("Failed to read raw data from file \"" + path + "\".");
+
+  const auto num_inputs = _input_nodes.size();
+  Data data(num_inputs);
+  const std::vector<size_t> chunk_sizes = getEachByteSizeOf(_input_nodes);
+
+  // cursor walks through the blob; each input consumes chunk_sizes[i] bytes
+  const char *cursor = blob.data();
+  for (uint32_t i = 0; i < num_inputs; i++)
+  {
+    const auto node = loco::must_cast<const luci::CircleInput *>(_input_nodes.at(i));
+    const auto chunk = chunk_sizes.at(i);
+    auto &tensor = data.at(i);
+    tensor = *createEmptyTensor(node).get();
+    std::memcpy(tensor.buffer(), cursor, chunk);
+    cursor += chunk;
+  }
+
+  return data;
+}
+
+// Create the loader that matches format: HDF5Loader for H5, DirectoryLoader for DIR.
+// Any other format raises std::runtime_error.
+std::unique_ptr<InputDataLoader> makeDataLoader(const std::string &file_path,
+                                                const InputFormat &format,
+                                                const std::vector<loco::Node *> &input_nodes)
+{
+  std::unique_ptr<InputDataLoader> loader;
+
+  if (format == InputFormat::H5)
+    loader = std::make_unique<HDF5Loader>(file_path, input_nodes);
+  else if (format == InputFormat::DIR)
+    loader = std::make_unique<DirectoryLoader>(file_path, input_nodes);
+  else
+    throw std::runtime_error{"Unsupported input format."};
+
+  // Reached only for a supported format; ownership moves to the caller
+  return loader;
+}
+
+} // namespace circle_eval_diff
diff --git a/compiler/circle-eval-diff/src/InputDataLoader.h b/compiler/circle-eval-diff/src/InputDataLoader.h
new file mode 100644
index 000000000..14921b239
--- /dev/null
+++ b/compiler/circle-eval-diff/src/InputDataLoader.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CIRCLE_EVAL_DIFF_INPUT_DATA_LOADER_H__
+#define __CIRCLE_EVAL_DIFF_INPUT_DATA_LOADER_H__
+
+#include <dio_hdf5/HDF5Importer.h>
+#include <loco/IR/Node.h>
+#include <luci/IR/CircleNodes.h>
+
+#include "Tensor.h"
+
+#include <memory>
+#include <string>
+
+namespace circle_eval_diff
+{
+
+// Verify 'dtype' and 'shape' of loaded data against 'input_node'
+//
+// The unit test of this function expects no exception when both match the
+// input node, and an exception when the shape differs.
+// NOTE(review) A dtype mismatch presumably throws as well - confirm in the
+//              definition, which is not part of this header.
+void verifyTypeShape(const luci::CircleInput *input_node, const loco::DataType &dtype,
+                     const std::vector<loco::Dimension> &shape);
+
+} // namespace circle_eval_diff
+
+namespace circle_eval_diff
+{
+
+// Kind of input data source
+enum class InputFormat
+{
+  Undefined, // For debugging
+  H5,
+  DIR, // directory
+  // TODO Implement Random
+};
+
+// Abstract interface of input data sources
+// HDF5Loader and DirectoryLoader are the implementations declared in this header.
+class InputDataLoader
+{
+public:
+  // Result of a single get() call: a list of Tensors
+  using Data = std::vector<Tensor>;
+
+public:
+  virtual ~InputDataLoader() = default;
+
+public:
+  // NOTE(review) Presumably the number of records available through get() - confirm
+  virtual uint32_t size(void) const = 0;
+
+public:
+  // Return the Data selected by 'data_idx'
+  virtual Data get(uint32_t data_idx) const = 0;
+};
+
+// InputDataLoader that holds a dio::hdf5::HDF5Importer
+// NOTE(review) 'file_path' is presumably the path of an HDF5 file - confirm in the definition
+class HDF5Loader final : public InputDataLoader
+{
+public:
+  HDF5Loader(const std::string &file_path, const std::vector<loco::Node *> &input_nodes);
+
+public:
+  uint32_t size(void) const final;
+  Data get(uint32_t data_idx) const final;
+
+private:
+  // Input node pointers, stored by value
+  const std::vector<loco::Node *> _input_nodes;
+  // Importer owned by this loader
+  std::unique_ptr<dio::hdf5::HDF5Importer> _hdf5;
+};
+
+// This class loads a directory that contains raw data binary files.
+// get(data_idx) reads the file at _data_paths.at(data_idx) and splits its bytes
+// into one Tensor per input node.
+class DirectoryLoader final : public InputDataLoader
+{
+public:
+  DirectoryLoader(const std::string &dir_path, const std::vector<loco::Node *> &input_nodes);
+
+public:
+  uint32_t size(void) const final;
+  Data get(uint32_t data_idx) const final;
+
+private:
+  // Input node pointers, stored by value
+  const std::vector<loco::Node *> _input_nodes;
+  // Paths of the raw data files, indexed by data_idx
+  std::vector<std::string> _data_paths;
+};
+
+// Create the InputDataLoader that corresponds to 'format'
+// 'file_path' and 'input_nodes' are handed over to the created loader.
+std::unique_ptr<InputDataLoader> makeDataLoader(const std::string &file_path,
+                                                const InputFormat &format,
+                                                const std::vector<loco::Node *> &input_nodes);
+
+} // namespace circle_eval_diff
+
+#endif // __CIRCLE_EVAL_DIFF_INPUT_DATA_LOADER_H__
diff --git a/compiler/circle-eval-diff/src/InputDataLoader.test.cpp b/compiler/circle-eval-diff/src/InputDataLoader.test.cpp
new file mode 100644
index 000000000..cbe78797b
--- /dev/null
+++ b/compiler/circle-eval-diff/src/InputDataLoader.test.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <luci/IR/CircleNodes.h>
+
+#include "InputDataLoader.h"
+
+using namespace circle_eval_diff;
+
+// Positive case: dtype and shape given to verifyTypeShape are those of the input node
+TEST(CircleEvalInputDataLoaderTest, verifyTypeShapeTest)
+{
+  // Shape shared by the input node and the data to verify
+  const uint32_t dims[] = {1, 3, 3, 2};
+
+  luci::CircleInput input;
+  input.dtype(loco::DataType::FLOAT32);
+  input.rank(4);
+
+  std::vector<loco::Dimension> right_shape;
+  for (uint32_t axis = 0; axis < 4; ++axis)
+  {
+    input.dim(axis).set(dims[axis]);
+    right_shape.emplace_back(dims[axis]);
+  }
+
+  const loco::DataType right_data_type = loco::DataType::FLOAT32;
+
+  EXPECT_NO_THROW(verifyTypeShape(&input, right_data_type, right_shape));
+}
+
+// Negative case: verifyTypeShape must throw when the given data does not fit the input node
+TEST(CircleEvalInputDataLoaderTest, verifyTypeShapeTest_NEG)
+{
+  // Input node: FLOAT32, shape [1, 4, 4, 2]
+  luci::CircleInput input;
+  input.dtype(loco::DataType::FLOAT32);
+  input.rank(4);
+  input.dim(0).set(1);
+  input.dim(1).set(4);
+  input.dim(2).set(4);
+  input.dim(3).set(2);
+
+  loco::DataType right_data_type{loco::DataType::FLOAT32};
+  loco::DataType wrong_data_type{loco::DataType::FLOAT16};
+  // Shape [1, 3, 3, 2], which differs from the shape of the input node
+  std::vector<loco::Dimension> wrong_shape;
+  wrong_shape.emplace_back(1);
+  wrong_shape.emplace_back(3);
+  wrong_shape.emplace_back(3);
+  wrong_shape.emplace_back(2);
+
+  // Right dtype, wrong shape
+  EXPECT_ANY_THROW(verifyTypeShape(&input, right_data_type, wrong_shape));
+  // Wrong dtype, wrong shape
+  // NOTE(review) The shape is wrong here too, so this case alone does not show
+  //              that a dtype mismatch is detected. Consider adding a case
+  //              with the wrong dtype and a matching shape.
+  EXPECT_ANY_THROW(verifyTypeShape(&input, wrong_data_type, wrong_shape));
+}
diff --git a/compiler/circle-eval-diff/src/MetricPrinter.cpp b/compiler/circle-eval-diff/src/MetricPrinter.cpp
index d65eb9b63..ec8408471 100644
--- a/compiler/circle-eval-diff/src/MetricPrinter.cpp
+++ b/compiler/circle-eval-diff/src/MetricPrinter.cpp
@@ -18,6 +18,7 @@
 
 #include <luci/IR/CircleNode.h>
 
+#include <limits>
 #include <iostream>
 #include <cassert>
 
@@ -30,6 +31,16 @@ using Tensor = circle_eval_diff::Tensor;
 namespace
 {
 
+// Return the number of elements of 'node', i.e. the product of all of its
+// dimensions (1 when the rank is 0)
+uint32_t num_elems(const luci::CircleNode *node)
+{
+  uint32_t count = 1;
+
+  uint32_t axis = 0;
+  while (axis < node->rank())
+  {
+    count *= node->dim(axis).value();
+    ++axis;
+  }
+
+  return count;
+}
+
 template <typename T> bool same_shape(const T a, const T b)
 {
   if (a->rank() != b->rank())
@@ -44,6 +55,8 @@ template <typename T> bool same_shape(const T a, const T b)
   return true;
 }
 
+// Return true if 'a' and 'b' report the same dtype()
+template <typename T> bool same_dtype(const T a, const T b)
+{
+  const auto lhs = a->dtype();
+  const auto rhs = b->dtype();
+
+  return lhs == rhs;
+}
+
 template <loco::DataType DT> std::shared_ptr<Tensor> to_fp32(const std::shared_ptr<Tensor> &tensor)
 {
   assert(tensor->dtype() == DT); // FIX_CALLER_UNLESS
@@ -97,7 +110,6 @@ void MAEPrinter::init(const luci::Module *first, const luci::Module *second)
   {
     const auto first_node = loco::must_cast<luci::CircleNode *>(first_output[i]);
     const auto second_node = loco::must_cast<luci::CircleNode *>(second_output[i]);
-    assert(same_shape(first_node, second_node)); // FIX_CALLER_UNLESS
 
     // Create tensors to store intermediate results
     _intermediate.emplace_back();
@@ -180,6 +192,471 @@ void MAEPrinter::dump(std::ostream &os) const
   }
 }
 
+// TODO Remove duplicate codes with MAEPrinter
+// For every output node, prepare a FLOAT32 tensor (accumulator of errors) and
+// save the name of the output of 'first' for logging
+// 'first' and 'second' must not be nullptr (checked with THROW_UNLESS)
+void MAPEPrinter::init(const luci::Module *first, const luci::Module *second)
+{
+  THROW_UNLESS(first != nullptr, "Invalid module.");
+  THROW_UNLESS(second != nullptr, "Invalid module.");
+
+  const auto first_output = loco::output_nodes(first->graph());
+  const auto second_output = loco::output_nodes(second->graph());
+
+  assert(first_output.size() == second_output.size()); // FIX_CALLER_UNLESS
+
+  for (uint32_t i = 0; i < first_output.size(); i++)
+  {
+    const auto first_node = loco::must_cast<luci::CircleNode *>(first_output[i]);
+    const auto second_node = loco::must_cast<luci::CircleNode *>(second_output[i]);
+
+    // Create tensors to store intermediate results
+    _intermediate.emplace_back();
+    _intermediate.at(i).dtype(loco::DataType::FLOAT32);
+    // NOTE Use both first_node and second_node to avoid release build break
+    // NOTE(review) Rank is taken from first_node while dimensions are taken from
+    //              second_node, so both outputs are assumed to have the same rank
+    _intermediate.at(i).rank(first_node->rank());
+    uint32_t num_elems = 1;
+    for (uint32_t j = 0; j < second_node->rank(); j++)
+    {
+      _intermediate.at(i).dim(j) = second_node->dim(j);
+      num_elems *= second_node->dim(j).value();
+    }
+    _intermediate.at(i).size<loco::DataType::FLOAT32>(num_elems);
+
+    // Check the buffer is initialized with zero
+    for (uint32_t j = 0; j < num_elems; j++)
+      assert(_intermediate.at(i).at<loco::DataType::FLOAT32>(j) == 0.0);
+
+    // Save output names for logging
+    _output_names.emplace_back(first_node->name());
+  }
+}
+
+// Accumulate |(a - b) / a| element-wise into _intermediate[output_idx]
+//
+// a: reference values (used as the denominator)
+// b: values compared against a
+// Both tensors must be FLOAT32 and have the same shape.
+//
+// NOTE When a and b are exactly equal, the error is 0 and the division is
+//      skipped. Otherwise a == b == 0 would give 0 / 0 = NaN, and a single
+//      NaN would make the whole MAPE of the output NaN.
+// NOTE When a == 0 and b != 0 the percentage error is not defined; inf is
+//      accumulated in that case, as before.
+void MAPEPrinter::accum_mean_absolute_error(uint32_t output_idx, const std::shared_ptr<Tensor> &a,
+                                            const std::shared_ptr<Tensor> &b)
+{
+  assert(a->dtype() == loco::DataType::FLOAT32 and
+         b->dtype() == loco::DataType::FLOAT32); // FIX_CALLER_UNLESS
+  assert(same_shape(a.get(), b.get()));          // FIX_CALLER_UNLESS
+  assert(output_idx < _intermediate.size());     // FIX_CALLER_UNLESS
+
+  for (uint32_t i = 0; i < a->size<loco::DataType::FLOAT32>(); i++)
+  {
+    const auto a_val = a->at<loco::DataType::FLOAT32>(i);
+    const auto b_val = b->at<loco::DataType::FLOAT32>(i);
+
+    // Identical values contribute no error (this also covers a == b == 0)
+    if (a_val == b_val)
+      continue;
+
+    _intermediate.at(output_idx).at<loco::DataType::FLOAT32>(i) +=
+      std::abs((a_val - b_val) / a_val);
+  }
+}
+
+// Accumulate the absolute percentage error of one data (one tensor per output)
+//
+// Assumption
+// first: the result of fp32 model
+// second: the result of fake-quantized model
+void MAPEPrinter::accumulate(const std::vector<std::shared_ptr<Tensor>> &first,
+                             const std::vector<std::shared_ptr<Tensor>> &second)
+{
+  assert(first.size() == second.size());        // FIX_CALLER_UNLESS
+  assert(first.size() == _intermediate.size()); // FIX_CALLER_UNLESS
+
+  for (uint32_t output_idx = 0; output_idx < _intermediate.size(); output_idx++)
+  {
+    const auto first_output = first[output_idx];
+    const auto second_output = second[output_idx];
+
+    // Cast data to fp32 and then compute absolute percentage error
+    const auto fp32_first_output = fp32(first_output);
+    const auto fp32_second_output = fp32(second_output);
+
+    accum_mean_absolute_error(output_idx, fp32_first_output, fp32_second_output);
+  }
+
+  // Number of accumulated data, used as a divisor in dump()
+  _num_data++;
+}
+
+// Print MAPE of each output to 'os'
+// MAPE = (sum of accumulated errors) / (number of elements) / (number of data) * 100
+void MAPEPrinter::dump(std::ostream &os) const
+{
+  os << "Mean Absolute Percentage Error (MAPE)" << std::endl;
+
+  for (uint32_t idx = 0; idx < _intermediate.size(); ++idx)
+  {
+    const auto &output_name = _output_names.at(idx);
+    const auto &accum = _intermediate.at(idx);
+    assert(accum.dtype() == loco::DataType::FLOAT32); // FIX_ME_UNLESS
+    const auto total_elems = accum.size<loco::DataType::FLOAT32>();
+
+    // Sum of the accumulated errors over all elements
+    float total = 0.0;
+    uint32_t elem = 0;
+    while (elem < total_elems)
+    {
+      total += accum.at<loco::DataType::FLOAT32>(elem);
+      ++elem;
+    }
+
+    // Average over elements, then over data, then convert to percentage
+    const float per_elem = total / total_elems;
+    const float per_data = per_elem / _num_data;
+    const float mape = per_data * 100.0;
+
+    os << "MAPE for " << output_name << " is " << mape << "%" << std::endl;
+  }
+}
+
+// TODO Remove duplicate codes with MAEPrinter
+// For every output node, prepare a place to accumulate PEIR (initially 0) and
+// save the name of the output of 'first' for logging
+// 'first' and 'second' must not be nullptr (checked with THROW_UNLESS)
+void MPEIRPrinter::init(const luci::Module *first, const luci::Module *second)
+{
+  THROW_UNLESS(first != nullptr, "Invalid module.");
+  THROW_UNLESS(second != nullptr, "Invalid module.");
+
+  const auto first_output = loco::output_nodes(first->graph());
+  const auto second_output = loco::output_nodes(second->graph());
+
+  assert(first_output.size() == second_output.size()); // FIX_CALLER_UNLESS
+
+  for (uint32_t i = 0; i < first_output.size(); i++)
+  {
+    const auto first_node = loco::must_cast<luci::CircleOutput *>(first_output[i]);
+    // The second output is cast only for the check done by must_cast.
+    // The node itself is not needed, so the result is discarded instead of
+    // being kept in an unused variable.
+    static_cast<void>(loco::must_cast<luci::CircleOutput *>(second_output[i]));
+
+    // Create places to store intermediate results
+    _intermediate.emplace_back(0.0);
+
+    // Save output names for logging
+    _output_names.emplace_back(first_node->name());
+  }
+}
+
+// Accumulate PEIR (Peak Error to Interval Ratio) of one output
+// PEIR = max(|a - b|) / (max(a) - min(a))
+// PEIR >= 0 (lower is better)
+//
+// The range of a and the peak error are gathered in a single pass.
+void MPEIRPrinter::accum_peir(uint32_t output_idx, const std::shared_ptr<Tensor> &a,
+                              const std::shared_ptr<Tensor> &b)
+{
+  assert(a->dtype() == loco::DataType::FLOAT32 and
+         b->dtype() == loco::DataType::FLOAT32); // FIX_CALLER_UNLESS
+  assert(same_shape(a.get(), b.get()));          // FIX_CALLER_UNLESS
+  assert(output_idx < _intermediate.size());     // FIX_CALLER_UNLESS
+
+  float lowest_a = std::numeric_limits<float>::max();
+  float highest_a = std::numeric_limits<float>::lowest();
+  float peak_error = std::numeric_limits<float>::lowest();
+
+  for (uint32_t idx = 0; idx < a->size<loco::DataType::FLOAT32>(); ++idx)
+  {
+    const auto a_val = a->at<loco::DataType::FLOAT32>(idx);
+    const auto b_val = b->at<loco::DataType::FLOAT32>(idx);
+
+    lowest_a = std::min(a_val, lowest_a);
+    highest_a = std::max(a_val, highest_a);
+    peak_error = std::max(std::abs(a_val - b_val), peak_error);
+  }
+
+  // Corner case: All values are the same. We set interval = 1 in this case
+  float interval = highest_a - lowest_a;
+  if (interval == 0)
+    interval = 1.0;
+
+  _intermediate.at(output_idx) += peak_error / interval;
+}
+
+// Accumulate PEIR of one data (one tensor per output)
+//
+// Assumption (when testing the accuracy of quantized model)
+// first: the result of fp32 model
+// second: the result of fake-quantized model
+void MPEIRPrinter::accumulate(const std::vector<std::shared_ptr<Tensor>> &first,
+                              const std::vector<std::shared_ptr<Tensor>> &second)
+{
+  assert(first.size() == second.size());        // FIX_CALLER_UNLESS
+  assert(first.size() == _intermediate.size()); // FIX_CALLER_UNLESS
+
+  for (uint32_t idx = 0; idx < _intermediate.size(); ++idx)
+  {
+    // Cast data to fp32 for ease of computation
+    const auto lhs_fp32 = fp32(first[idx]);
+    const auto rhs_fp32 = fp32(second[idx]);
+
+    accum_peir(idx, lhs_fp32, rhs_fp32);
+  }
+
+  // Number of accumulated data, used as a divisor in dump()
+  ++_num_data;
+}
+
+// Print MPEIR of each output to 'os'
+// MPEIR = (accumulated PEIR) / (number of data)
+void MPEIRPrinter::dump(std::ostream &os) const
+{
+  os << "Mean Peak Error to Interval Ratio (MPEIR)" << std::endl;
+
+  for (uint32_t idx = 0; idx < _intermediate.size(); ++idx)
+  {
+    const auto &output_name = _output_names.at(idx);
+
+    // Compute MPEIR
+    const float mpeir = _intermediate.at(idx) / _num_data;
+
+    os << "MPEIR for " << output_name << " is " << mpeir << std::endl;
+  }
+}
+
+// TODO Remove duplicate codes with MAEPrinter
+// For every output node, prepare a place to accumulate the top-k match ratio
+// (initially 0) and save the name of the output of 'first' for logging.
+// Outputs of 'first' with fewer than k elements are registered in _skip_output
+// and reported on std::cout.
+// 'first' and 'second' must not be nullptr (checked with THROW_UNLESS)
+void TopKMatchPrinter::init(const luci::Module *first, const luci::Module *second)
+{
+  THROW_UNLESS(first != nullptr, "Invalid module.");
+  THROW_UNLESS(second != nullptr, "Invalid module.");
+
+  const auto first_output = loco::output_nodes(first->graph());
+  const auto second_output = loco::output_nodes(second->graph());
+
+  assert(first_output.size() == second_output.size()); // FIX_CALLER_UNLESS
+
+  for (uint32_t i = 0; i < first_output.size(); i++)
+  {
+    const auto first_node = loco::must_cast<luci::CircleOutput *>(first_output[i]);
+    // The second output is cast only for the check done by must_cast.
+    // The node itself is not needed, so the result is discarded instead of
+    // being kept in an unused variable.
+    static_cast<void>(loco::must_cast<luci::CircleOutput *>(second_output[i]));
+
+    // Create places to store intermediate results
+    _intermediate.emplace_back(0.0);
+
+    // Save output names for logging
+    _output_names.emplace_back(first_node->name());
+
+    // If num_elems of an output is less than k,
+    // the output index is added to the skip list
+    if (num_elems(first_node) < _k)
+    {
+      // NOTE A space is required before "metric"; otherwise "Top-5metric" is printed
+      std::cout << "Top-" << _k << " metric for " << first_node->name()
+                << " is ignored, because it has elements less than " << _k << std::endl;
+      _skip_output.emplace_back(i);
+    }
+  }
+}
+
+// Accumulate the top-k match ratio of one output
+//
+// The indices of the k largest elements of 'a' and of 'b' are collected, and
+// (number of indices found in both sets) / k is added to _intermediate[output_idx].
+// Both tensors must be FLOAT32, have the same shape and have at least k elements.
+//
+// NOTE(review) If _k is 0, 'topk' is empty: std::min_element returns end() which
+//              is then dereferenced, and the ratio is computed as 0 / 0.
+//              Confirm that callers never create this printer with k = 0.
+void TopKMatchPrinter::accum_topk_accuracy(uint32_t output_idx, const std::shared_ptr<Tensor> &a,
+                                           const std::shared_ptr<Tensor> &b)
+{
+  assert(a->dtype() == loco::DataType::FLOAT32 and
+         b->dtype() == loco::DataType::FLOAT32); // FIX_CALLER_UNLESS
+  assert(same_shape(a.get(), b.get()));          // FIX_CALLER_UNLESS
+  assert(output_idx < _intermediate.size());     // FIX_CALLER_UNLESS
+
+  // Find Top-k largest elements
+  // This implementation is a variant of "Method 2 (Use temporary array)" in
+  // https://www.geeksforgeeks.org/k-largestor-smallest-elements-in-an-array/
+  // We sort top-k elements by value and index to ensure that the element with an earlier
+  // index comes first if multiple elements have the same value.
+  auto find_topk = [this](const std::shared_ptr<Tensor> &tensor) {
+    assert(_k <= tensor->size<loco::DataType::FLOAT32>()); // FIX_CALLER_UNLESS
+
+    // first: value, second: index
+    std::vector<std::pair<float, uint32_t>> topk;
+    topk.resize(_k);
+
+    // Initialize
+    for (uint32_t i = 0; i < _k; i++)
+    {
+      topk[i] = std::make_pair(tensor->at<loco::DataType::FLOAT32>(i), i);
+    }
+
+    // Input pair: (value, index)
+    // Return true if a has smaller value than b. If a and b have the same value,
+    // return true if a has larger index.
+    auto compare = [](const std::pair<float, uint32_t> &a, const std::pair<float, uint32_t> &b) {
+      if (a.first == b.first)
+        return a.second > b.second;
+
+      return a.first < b.first;
+    };
+
+    // Scan the remaining elements; each one replaces the current minimum of
+    // 'topk' when it is larger than that minimum
+    for (uint32_t i = _k; i < tensor->size<loco::DataType::FLOAT32>(); i++)
+    {
+      auto val = std::make_pair(tensor->at<loco::DataType::FLOAT32>(i), i);
+
+      auto min = std::min_element(topk.begin(), topk.end(), compare);
+      if (compare(*min, val))
+      {
+        // val is larger than min. Replace min with val.
+        auto min_index = std::distance(topk.begin(), min);
+        topk[min_index] = val;
+      }
+    }
+
+    return topk;
+  };
+
+  auto first_topk = find_topk(a);
+  auto second_topk = find_topk(b);
+
+  // Count the indices that appear in both top-k sets (order is ignored)
+  uint32_t matched = 0;
+  for (uint32_t i = 0; i < _k; i++)
+  {
+    for (uint32_t j = 0; j < _k; j++)
+    {
+      if (first_topk[i].second == second_topk[j].second)
+      {
+        matched++;
+        break;
+      }
+    }
+  }
+
+  float matched_ratio = static_cast<float>(matched) / _k;
+
+  _intermediate.at(output_idx) += matched_ratio;
+}
+
+// Return true if 'output_index' is registered in _skip_output
+bool TopKMatchPrinter::in_skip_list(uint32_t output_index) const
+{
+  bool found = false;
+
+  // Stop scanning as soon as the index is found
+  for (uint32_t pos = 0; pos < _skip_output.size() and not found; ++pos)
+    found = (_skip_output[pos] == output_index);
+
+  return found;
+}
+
+// Accumulate the top-k match ratio of one data (one tensor per output)
+// Outputs registered in the skip list are not evaluated.
+void TopKMatchPrinter::accumulate(const std::vector<std::shared_ptr<Tensor>> &first,
+                                  const std::vector<std::shared_ptr<Tensor>> &second)
+{
+  assert(first.size() == second.size());        // FIX_CALLER_UNLESS
+  assert(first.size() == _intermediate.size()); // FIX_CALLER_UNLESS
+
+  for (uint32_t idx = 0; idx < _intermediate.size(); ++idx)
+  {
+    if (not in_skip_list(idx))
+    {
+      // Cast data to fp32 for ease of computation
+      const auto lhs_fp32 = fp32(first[idx]);
+      const auto rhs_fp32 = fp32(second[idx]);
+
+      accum_topk_accuracy(idx, lhs_fp32, rhs_fp32);
+    }
+  }
+
+  // Number of accumulated data, used as a divisor in dump()
+  ++_num_data;
+}
+
+// Print the mean top-k match ratio of each output to 'os'
+// Outputs registered in the skip list are not printed.
+void TopKMatchPrinter::dump(std::ostream &os) const
+{
+  os << "Ratio of Matched Indices between Top-" << _k << " results of the models" << std::endl;
+
+  for (uint32_t idx = 0; idx < _intermediate.size(); ++idx)
+  {
+    if (not in_skip_list(idx))
+    {
+      const auto &output_name = _output_names.at(idx);
+
+      // Compute TopKMatch
+      const float mean_topk = _intermediate.at(idx) / _num_data;
+
+      os << "Mean Top-" << _k << " match ratio for " << output_name << " is " << mean_topk
+         << std::endl;
+    }
+  }
+}
+
+// For every output node, prepare a FLOAT32 tensor (accumulator of squared
+// errors) and save the name of the output of 'first' for logging
+// 'first' and 'second' must not be nullptr (checked with THROW_UNLESS)
+void MSEPrinter::init(const luci::Module *first, const luci::Module *second)
+{
+  THROW_UNLESS(first != nullptr, "Invalid module.");
+  THROW_UNLESS(second != nullptr, "Invalid module.");
+
+  const auto first_output = loco::output_nodes(first->graph());
+  const auto second_output = loco::output_nodes(second->graph());
+
+  assert(first_output.size() == second_output.size()); // FIX_CALLER_UNLESS
+
+  for (uint32_t i = 0; i < first_output.size(); i++)
+  {
+    const auto first_node = loco::must_cast<luci::CircleNode *>(first_output[i]);
+    const auto second_node = loco::must_cast<luci::CircleNode *>(second_output[i]);
+
+    // Create tensors to store intermediate results
+    _intermediate.emplace_back();
+    _intermediate.at(i).dtype(loco::DataType::FLOAT32);
+    // NOTE Use both first_node and second_node to avoid release build break
+    // NOTE(review) Rank is taken from first_node while dimensions are taken from
+    //              second_node, so both outputs are assumed to have the same rank
+    _intermediate.at(i).rank(first_node->rank());
+    uint32_t num_elems = 1;
+    for (uint32_t j = 0; j < second_node->rank(); j++)
+    {
+      _intermediate.at(i).dim(j) = second_node->dim(j);
+      num_elems *= second_node->dim(j).value();
+    }
+    _intermediate.at(i).size<loco::DataType::FLOAT32>(num_elems);
+
+    // Check the buffer is initialized with zero
+    for (uint32_t j = 0; j < num_elems; j++)
+      assert(_intermediate.at(i).at<loco::DataType::FLOAT32>(j) == 0.0);
+
+    // Save output names for logging
+    _output_names.emplace_back(first_node->name());
+  }
+}
+
+// Accumulate (a - b)^2 element-wise into _intermediate[output_idx]
+// Both tensors must be FLOAT32 and have the same shape.
+void MSEPrinter::accum_squared_error(uint32_t output_idx, const std::shared_ptr<Tensor> &a,
+                                     const std::shared_ptr<Tensor> &b)
+{
+  assert(a->dtype() == loco::DataType::FLOAT32 and
+         b->dtype() == loco::DataType::FLOAT32); // FIX_CALLER_UNLESS
+  assert(same_shape(a.get(), b.get()));          // FIX_CALLER_UNLESS
+  assert(output_idx < _intermediate.size());     // FIX_CALLER_UNLESS
+
+  for (uint32_t idx = 0; idx < a->size<loco::DataType::FLOAT32>(); ++idx)
+  {
+    // Difference of the two values; its square is the error of this element
+    const auto diff = a->at<loco::DataType::FLOAT32>(idx) - b->at<loco::DataType::FLOAT32>(idx);
+
+    _intermediate.at(output_idx).at<loco::DataType::FLOAT32>(idx) += diff * diff;
+  }
+}
+
+// Accumulate the squared error of one data (one tensor per output)
+void MSEPrinter::accumulate(const std::vector<std::shared_ptr<Tensor>> &first,
+                            const std::vector<std::shared_ptr<Tensor>> &second)
+{
+  assert(first.size() == second.size());        // FIX_CALLER_UNLESS
+  assert(first.size() == _intermediate.size()); // FIX_CALLER_UNLESS
+
+  for (uint32_t output_idx = 0; output_idx < _intermediate.size(); output_idx++)
+  {
+    const auto first_output = first[output_idx];
+    const auto second_output = second[output_idx];
+
+    // Cast data to fp32 and then compute squared error
+    const auto fp32_first_output = fp32(first_output);
+    const auto fp32_second_output = fp32(second_output);
+
+    accum_squared_error(output_idx, fp32_first_output, fp32_second_output);
+  }
+
+  // Number of accumulated data, used as a divisor in dump()
+  _num_data++;
+}
+
+// Print MSE of each output to 'os'
+// MSE = (sum of accumulated squared errors) / (number of elements) / (number of data)
+void MSEPrinter::dump(std::ostream &os) const
+{
+  os << "Mean Squared Error (MSE)" << std::endl;
+
+  for (uint32_t idx = 0; idx < _intermediate.size(); ++idx)
+  {
+    const auto &output_name = _output_names.at(idx);
+    const auto &accum = _intermediate.at(idx);
+    assert(accum.dtype() == loco::DataType::FLOAT32); // FIX_ME_UNLESS
+    const auto total_elems = accum.size<loco::DataType::FLOAT32>();
+
+    // Sum of the accumulated squared errors over all elements
+    float total = 0.0;
+    uint32_t elem = 0;
+    while (elem < total_elems)
+    {
+      total += accum.at<loco::DataType::FLOAT32>(elem);
+      ++elem;
+    }
+
+    // Average over elements, then over data
+    const float per_elem = total / total_elems;
+    const float mse = per_elem / _num_data;
+
+    os << "MSE for " << output_name << " is " << mse << std::endl;
+  }
+}
+
 } // namespace circle_eval_diff
 
 #undef THROW_UNLESS
diff --git a/compiler/circle-eval-diff/src/MetricPrinter.h b/compiler/circle-eval-diff/src/MetricPrinter.h
index b51581c31..c8f27511c 100644
--- a/compiler/circle-eval-diff/src/MetricPrinter.h
+++ b/compiler/circle-eval-diff/src/MetricPrinter.h
@@ -85,6 +85,133 @@ private:
   uint32_t _num_data = 0;
 };
 
+// Mean Squared Error
+// MSE = sum((a - b)^2) / (number of elements) / (number of data), per output
+class MSEPrinter final : public MetricPrinter
+{
+public:
+  // Prepare accumulators and output names from the outputs of the two modules
+  void init(const luci::Module *first, const luci::Module *second);
+
+  // Accumulate the squared error of one data (one tensor per output)
+  void accumulate(const std::vector<std::shared_ptr<Tensor>> &first,
+                  const std::vector<std::shared_ptr<Tensor>> &second);
+
+  // Print MSE of each output
+  void dump(std::ostream &os) const;
+
+private:
+  void accum_squared_error(uint32_t index, const std::shared_ptr<Tensor> &a,
+                           const std::shared_ptr<Tensor> &b);
+
+private:
+  // Store accumulated sum of squared error for each output
+  std::vector<Tensor> _intermediate;
+  std::vector<std::string> _output_names;
+  // Number of accumulate() calls
+  uint32_t _num_data = 0;
+};
+
+// Mean Absolute Percentage Error
+// MAPE = sum(|(a - b) / a|) / (number of elements) / (number of data) * 100, per output
+// a: result of the first model, b: result of the second model
+class MAPEPrinter final : public MetricPrinter
+{
+public:
+  // Prepare accumulators and output names from the outputs of the two modules
+  void init(const luci::Module *first, const luci::Module *second);
+
+  // Accumulate the absolute percentage error of one data (one tensor per output)
+  void accumulate(const std::vector<std::shared_ptr<Tensor>> &first,
+                  const std::vector<std::shared_ptr<Tensor>> &second);
+
+  // Print MAPE of each output
+  void dump(std::ostream &os) const;
+
+private:
+  // Accumulate |(a - b) / a| for the output selected by 'index'
+  void accum_mean_absolute_error(uint32_t index, const std::shared_ptr<Tensor> &a,
+                                 const std::shared_ptr<Tensor> &b);
+
+private:
+  // Store accumulated sum of absolute percentage error for each output
+  std::vector<Tensor> _intermediate;
+  std::vector<std::string> _output_names;
+  // Number of accumulate() calls
+  uint32_t _num_data = 0;
+};
+
+// Mean Peak Error to Interval Ratio (PEIR)
+// PEIR = max(|a - b|) / (max(a) - min(a))
+// PEIR >= 0 (lower is better)
+//
+// When testing the accuracy of quantized model,
+// the first model should be the original fp32 model, and
+// the second model should be the fake-quantized fp32 model
+class MPEIRPrinter final : public MetricPrinter
+{
+public:
+  // Prepare accumulators and output names from the outputs of the two modules
+  void init(const luci::Module *first, const luci::Module *second);
+
+  // Accumulate PEIR of one data (one tensor per output)
+  void accumulate(const std::vector<std::shared_ptr<Tensor>> &first,
+                  const std::vector<std::shared_ptr<Tensor>> &second);
+
+  // Print MPEIR (accumulated PEIR / number of data) of each output
+  void dump(std::ostream &os) const;
+
+private:
+  // Accumulate PEIR for the output selected by 'index'
+  void accum_peir(uint32_t index, const std::shared_ptr<Tensor> &a,
+                  const std::shared_ptr<Tensor> &b);
+
+private:
+  // Store accumulated sum of PEIR for each output
+  std::vector<float> _intermediate;
+  std::vector<std::string> _output_names;
+  // Number of accumulate() calls
+  uint32_t _num_data = 0;
+};
+
+// Ratio of matched indices between top-k results of two models (a, b).
+//
+// top-k match = intersection(top_k_idx(a), top_k_idx(b)) / k
+// mean top-k match = sum(top-k match) / num_data
+//
+// For example,
+// num_data = 2
+// first model output = [1, 2, 3], [2, 3, 1]
+// second model output = [2, 4, 6], [3, 2, 1]
+//
+// if k = 1,
+// first model top-1 index = ([2], [1])
+// second model top-1 index = ([2], [0])
+// mean top-1 accuracy = (1 + 0) / 2 = 0.5
+//
+// if k = 2,
+// first model output = [1, 2, 3], [2, 3, 1]
+// second model output = [2, 4, 6], [3, 2, 1]
+// first model top-2 index = ([2, 1], [1, 0])
+// second model top-2 index = ([2, 1], [0, 1])
+// mean top-2 accuracy = (2/2 + 2/2) / 2 = 1
+//
+// NOTE Order of elements is ignored when comparing two top-k sets.
+// NOTE If two elements have the same value and only one can be included in top-k,
+// the one with an earlier index will be included.
+class TopKMatchPrinter : public MetricPrinter
+{
+public:
+  // k: number of top elements compared between the two models
+  // NOTE(review) accum_topk_accuracy divides by k, so k = 0 is presumably not
+  //              a supported value - confirm with the caller
+  TopKMatchPrinter(uint32_t k) : _k(k) {}
+
+public:
+  // Prepare accumulators, output names and the skip list
+  void init(const luci::Module *first, const luci::Module *second);
+
+  // Accumulate the top-k match ratio of one data (one tensor per output)
+  void accumulate(const std::vector<std::shared_ptr<Tensor>> &first,
+                  const std::vector<std::shared_ptr<Tensor>> &second);
+
+  // Print the mean top-k match ratio of each output that is not skipped
+  void dump(std::ostream &os) const;
+
+private:
+  // Accumulate the top-k match ratio for the output selected by 'index'
+  void accum_topk_accuracy(uint32_t index, const std::shared_ptr<Tensor> &a,
+                           const std::shared_ptr<Tensor> &b);
+
+  // Return true if the output is in the skip list (_skip_output)
+  bool in_skip_list(uint32_t output_index) const;
+
+private:
+  const uint32_t _k = 0;
+  // Store accumulated accuracy
+  std::vector<float> _intermediate;
+  std::vector<std::string> _output_names;
+  // Number of accumulate() calls
+  uint32_t _num_data = 0;
+  // Save index of output whose num_elements is less than k
+  std::vector<uint32_t> _skip_output;
+};
+
 } // namespace circle_eval_diff
 
 #endif // __CIRCLE_EVAL_DIFF_METRIC_PRINTER_H__
diff --git a/compiler/circle-eval-diff/src/MetricPrinter.test.cpp b/compiler/circle-eval-diff/src/MetricPrinter.test.cpp
index 51ca89799..0e71b80cc 100644
--- a/compiler/circle-eval-diff/src/MetricPrinter.test.cpp
+++ b/compiler/circle-eval-diff/src/MetricPrinter.test.cpp
@@ -180,6 +180,23 @@ std::shared_ptr<Tensor> output_tensor_with_value(const luci::Module *module, flo
   return tensor;
 }
 
+// Create a tensor for the single output of 'module' and fill it element by
+// element with 'value' (element i of the tensor receives value[i])
+//
+// 'module' must have exactly one output, and 'value' must have as many
+// elements as the tensor; both are checked with assert.
+// 'value' is only read, so it is taken by const reference.
+std::shared_ptr<Tensor> output_tensor_with_value(const luci::Module *module,
+                                                 const std::vector<float> &value)
+{
+  auto outputs = loco::output_nodes(module->graph());
+  assert(outputs.size() == 1);
+  auto output = *outputs.begin();
+  auto output_cnode = loco::must_cast<luci::CircleNode *>(output);
+  auto tensor = create_empty_tensor(output_cnode);
+  auto tensor_size = tensor->size<loco::DataType::FLOAT32>();
+  assert(tensor_size == value.size());
+  for (uint32_t i = 0; i < tensor_size; i++)
+  {
+    tensor->at<loco::DataType::FLOAT32>(i) = value[i];
+  }
+  return tensor;
+}
+
 } // namespace
 
 namespace circle_eval_diff
@@ -233,4 +250,299 @@ TEST(CircleEvalMetricPrinterTest, MAE_init_with_null_NEG)
   EXPECT_ANY_THROW(mae.init(nullptr, nullptr));
 }
 
+TEST(CircleEvalMetricPrinterTest, MAPE_simple)
+{
+  luci::Module first;
+  AddOneGraph first_g;
+  first_g.init();
+
+  first.add(std::move(first_g.graph()));
+
+  luci::Module second;
+  AddTwoGraph second_g;
+  second_g.init();
+
+  second.add(std::move(second_g.graph()));
+
+  MAPEPrinter mape;
+
+  mape.init(&first, &second);
+
+  // The modules are not actually evaluated; fake results are fed in instead
+  // (first from 2.0, second from 1.0), for which the dump must report 50%.
+  std::vector<std::shared_ptr<Tensor>> first_result;
+  {
+    auto output = output_tensor_with_value(&first, 2.0);
+    first_result.emplace_back(output);
+  }
+
+  std::vector<std::shared_ptr<Tensor>> second_result;
+  {
+    auto output = output_tensor_with_value(&second, 1.0);
+    second_result.emplace_back(output);
+  }
+
+  mape.accumulate(first_result, second_result);
+
+  std::stringstream ss;
+  mape.dump(ss);
+  std::string result = ss.str();
+
+  EXPECT_NE(std::string::npos, result.find("MAPE for output_0 is 50%"));
+}
+
+TEST(CircleEvalMetricPrinterTest, MAPE_init_with_null_NEG)
+{
+  MAPEPrinter mape;
+
+  EXPECT_ANY_THROW(mape.init(nullptr, nullptr)); // init() must reject null modules
+}
+
+TEST(CircleEvalMetricPrinterTest, MPEIR_simple)
+{
+  luci::Module first;
+  AddOneGraph first_g;
+  first_g.init();
+
+  first.add(std::move(first_g.graph()));
+
+  luci::Module second;
+  AddTwoGraph second_g;
+  second_g.init();
+
+  second.add(std::move(second_g.graph()));
+
+  MPEIRPrinter mpeir;
+
+  mpeir.init(&first, &second);
+
+  // The modules are not actually evaluated; fake results are fed in instead
+  // (first = 0, 1, ..., 15; second built from the scalar 0.0), expecting MPEIR 1.
+  std::vector<std::shared_ptr<Tensor>> first_result;
+  {
+    std::vector<float> val;
+    val.resize(16);
+    for (uint32_t i = 0; i < 16; i++)
+      val[i] = i;
+
+    auto output = output_tensor_with_value(&first, val);
+    first_result.emplace_back(output);
+  }
+
+  std::vector<std::shared_ptr<Tensor>> second_result;
+  {
+    auto output = output_tensor_with_value(&second, 0.0);
+    second_result.emplace_back(output);
+  }
+
+  mpeir.accumulate(first_result, second_result);
+
+  std::stringstream ss;
+  mpeir.dump(ss);
+  std::string result = ss.str();
+
+  EXPECT_NE(std::string::npos, result.find("MPEIR for output_0 is 1"));
+}
+
+TEST(CircleEvalMetricPrinterTest, MPEIR_init_with_null_NEG)
+{
+  MPEIRPrinter mpeir;
+
+  EXPECT_ANY_THROW(mpeir.init(nullptr, nullptr)); // init() must reject null modules
+}
+
+TEST(CircleEvalMetricPrinterTest, TopK_simple)
+{
+  luci::Module first;
+  AddOneGraph first_g;
+  first_g.init();
+
+  first.add(std::move(first_g.graph()));
+
+  luci::Module second;
+  AddTwoGraph second_g;
+  second_g.init();
+
+  second.add(std::move(second_g.graph()));
+
+  TopKMatchPrinter top5(5);
+
+  top5.init(&first, &second);
+
+  // The modules are not actually evaluated; fake results are fed in instead
+  // (first[i] = i, second[i] = 2 * i), expecting a Top-5 match ratio of 1.
+  std::vector<std::shared_ptr<Tensor>> first_result;
+  {
+    std::vector<float> val;
+    val.resize(16);
+    for (uint32_t i = 0; i < 16; i++)
+      val[i] = i;
+
+    auto output = output_tensor_with_value(&first, val);
+    first_result.emplace_back(output);
+  }
+
+  std::vector<std::shared_ptr<Tensor>> second_result;
+  {
+    std::vector<float> val;
+    val.resize(16);
+    for (uint32_t i = 0; i < 16; i++)
+      val[i] = i * 2;
+    auto output = output_tensor_with_value(&second, val);
+    second_result.emplace_back(output);
+  }
+
+  top5.accumulate(first_result, second_result);
+
+  std::stringstream ss;
+  top5.dump(ss);
+  std::string result = ss.str();
+
+  EXPECT_NE(std::string::npos, result.find("Mean Top-5 match ratio for output_0 is 1"));
+}
+
+TEST(CircleEvalMetricPrinterTest, TopK_tie)
+{
+  luci::Module first;
+  AddOneGraph first_g;
+  first_g.init();
+
+  first.add(std::move(first_g.graph()));
+
+  luci::Module second;
+  AddTwoGraph second_g;
+  second_g.init();
+
+  second.add(std::move(second_g.graph()));
+
+  TopKMatchPrinter top5(5);
+
+  top5.init(&first, &second);
+
+  // The modules are not actually evaluated; fake results are fed in instead.
+  // second holds twelve tied 12s, yet the expected Top-5 match ratio is 0.8.
+  std::vector<std::shared_ptr<Tensor>> first_result;
+  {
+    std::vector<float> val;
+    val.resize(16);
+    for (uint32_t i = 0; i < 16; i++)
+      val[i] = i;
+
+    auto output = output_tensor_with_value(&first, val);
+    first_result.emplace_back(output);
+  }
+
+  std::vector<std::shared_ptr<Tensor>> second_result;
+  {
+    std::vector<float> val{12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15, 16};
+
+    auto output = output_tensor_with_value(&second, val);
+    second_result.emplace_back(output);
+  }
+
+  top5.accumulate(first_result, second_result);
+
+  std::stringstream ss;
+  top5.dump(ss);
+  std::string result = ss.str();
+
+  EXPECT_NE(std::string::npos, result.find("Mean Top-5 match ratio for output_0 is 0.8"));
+}
+
+TEST(CircleEvalMetricPrinterTest, TopK_num_elem_less_than_k_NEG)
+{
+  luci::Module first;
+  AddOneGraph first_g;
+  first_g.init();
+
+  first.add(std::move(first_g.graph()));
+
+  luci::Module second;
+  AddTwoGraph second_g;
+  second_g.init();
+
+  second.add(std::move(second_g.graph()));
+
+  TopKMatchPrinter top100(100);
+
+  top100.init(&first, &second);
+
+  // The modules are not actually evaluated; fake results are fed in instead.
+  // With k = 100 the dump must not contain a "Mean Top-100 match ratio" line.
+  std::vector<std::shared_ptr<Tensor>> first_result;
+  {
+    auto output = output_tensor_with_value(&first, 0);
+    first_result.emplace_back(output);
+  }
+
+  std::vector<std::shared_ptr<Tensor>> second_result;
+  {
+    auto output = output_tensor_with_value(&second, 0);
+    second_result.emplace_back(output);
+  }
+
+  top100.accumulate(first_result, second_result);
+
+  std::stringstream ss;
+  top100.dump(ss);
+  std::string result = ss.str();
+
+  EXPECT_EQ(std::string::npos, result.find("Mean Top-100 match ratio"));
+}
+
+TEST(CircleEvalMetricPrinterTest, TopK_init_with_null_NEG)
+{
+  TopKMatchPrinter topk(5);
+
+  EXPECT_ANY_THROW(topk.init(nullptr, nullptr)); // init() must reject null modules
+}
+
+TEST(CircleEvalMetricPrinterTest, MSE_simple)
+{
+  luci::Module first;
+  AddOneGraph first_g;
+  first_g.init();
+
+  first.add(std::move(first_g.graph()));
+
+  luci::Module second;
+  AddTwoGraph second_g;
+  second_g.init();
+
+  second.add(std::move(second_g.graph()));
+
+  MSEPrinter mse;
+
+  mse.init(&first, &second);
+
+  // The modules are not actually evaluated; fake results are fed in instead
+  // (first from 1.0, second from 2.0), for which the dump must report an MSE of 1.
+  std::vector<std::shared_ptr<Tensor>> first_result;
+  {
+    auto output = output_tensor_with_value(&first, 1.0);
+    first_result.emplace_back(output);
+  }
+
+  std::vector<std::shared_ptr<Tensor>> second_result;
+  {
+    auto output = output_tensor_with_value(&second, 2.0);
+    second_result.emplace_back(output);
+  }
+
+  mse.accumulate(first_result, second_result);
+
+  std::stringstream ss;
+  mse.dump(ss);
+  std::string result = ss.str();
+
+  EXPECT_NE(std::string::npos, result.find("MSE for output_0 is 1"));
+}
+
+TEST(CircleEvalMetricPrinterTest, MSE_init_with_null_NEG)
+{
+  MSEPrinter mse;
+
+  EXPECT_ANY_THROW(mse.init(nullptr, nullptr)); // init() must reject null modules
+}
+
 } // namespace circle_eval_diff
diff --git a/compiler/circle-eval-diff/src/ModuleEvalDiff.cpp b/compiler/circle-eval-diff/src/ModuleEvalDiff.cpp
deleted file mode 100644
index 85f985873..000000000
--- a/compiler/circle-eval-diff/src/ModuleEvalDiff.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "ModuleEvalDiff.h"
-#include "Tensor.h"
-
-#include <luci_interpreter/Interpreter.h>
-#include <dio_hdf5/HDF5Importer.h>
-
-#include <string>
-#include <stdexcept>
-#include <iostream>
-#include <cassert>
-
-using Tensor = circle_eval_diff::Tensor;
-using DataType = loco::DataType;
-using Shape = std::vector<loco::Dimension>;
-using HDF5Importer = dio::hdf5::HDF5Importer;
-
-namespace
-{
-
-// Check the type and the shape of CircleInput
-void verifyTypeShape(const luci::CircleInput *input_node, const DataType &dtype, const Shape &shape)
-{
-  // Type check
-  if (dtype != input_node->dtype())
-    throw std::runtime_error("Wrong input type.");
-
-  if (shape.size() != input_node->rank())
-    throw std::runtime_error("Input rank mismatch.");
-
-  for (uint32_t i = 0; i < shape.size(); i++)
-  {
-    if (not(shape.at(i) == input_node->dim(i)))
-      throw std::runtime_error("Input shape mismatch.");
-  }
-}
-
-// Return number of elements of the node.
-uint32_t numElements(const luci::CircleNode *node)
-{
-  uint32_t num_elem = 1;
-  for (uint32_t i = 0; i < node->rank(); ++i)
-    num_elem *= node->dim(i).value();
-  return num_elem;
-}
-
-// Return Tensor which has the same dtype and shape with node.
-// Buffer does not have any data yet.
-std::shared_ptr<Tensor> createEmptyTensor(const luci::CircleNode *node)
-{
-  auto tensor = std::make_shared<Tensor>();
-  {
-    tensor->dtype(node->dtype());
-    tensor->rank(node->rank());
-    for (uint32_t i = 0; i < node->rank(); i++)
-      tensor->dim(i) = node->dim(i);
-
-    switch (node->dtype())
-    {
-      case loco::DataType::FLOAT32:
-        tensor->size<loco::DataType::FLOAT32>(numElements(node));
-        break;
-      case loco::DataType::U8:
-        tensor->size<loco::DataType::U8>(numElements(node));
-        break;
-      case loco::DataType::S16:
-        tensor->size<loco::DataType::S16>(numElements(node));
-        break;
-      case loco::DataType::S32:
-        tensor->size<loco::DataType::S32>(numElements(node));
-        break;
-      case loco::DataType::S64:
-        tensor->size<loco::DataType::S64>(numElements(node));
-        break;
-      default:
-        throw std::runtime_error("Unsupported input tensor dtype for " + node->name());
-    }
-  }
-
-  return tensor;
-}
-
-} // namespace
-
-namespace circle_eval_diff
-{
-
-void H5InputEvalDiff::evalDiff(const std::string &first_input_data_path,
-                               const std::string &second_input_data_path) const
-{
-  const auto interp = std::make_unique<luci_interpreter::Interpreter>(_first_module.get());
-
-  _metric->init(_first_module.get(), _second_module.get());
-
-  try
-  {
-    HDF5Importer first_h5(first_input_data_path);
-    first_h5.importGroup("value");
-
-    HDF5Importer second_h5(second_input_data_path);
-    second_h5.importGroup("value");
-
-    const auto first_num_data = first_h5.numData();
-    const auto second_num_data = second_h5.numData();
-
-    if (first_num_data != second_num_data)
-      throw std::runtime_error(
-        "Number of data in the first data file and the second data file mismatches.");
-
-    if (first_num_data == 0)
-      throw std::runtime_error("Input data file does not contain any record.");
-
-    const auto first_input_nodes = loco::input_nodes(_first_module->graph());
-    const auto first_num_inputs = first_input_nodes.size();
-    const auto first_output_nodes = loco::output_nodes(_first_module->graph());
-    const auto first_num_outputs = first_output_nodes.size();
-
-    const auto second_input_nodes = loco::input_nodes(_second_module->graph());
-    const auto second_num_inputs = second_input_nodes.size();
-    const auto second_output_nodes = loco::output_nodes(_second_module->graph());
-    const auto second_num_outputs = second_output_nodes.size();
-
-    for (int32_t data_idx = 0; data_idx < first_num_data; data_idx++)
-    {
-      std::cout << "Evaluating " << data_idx << "'th data" << std::endl;
-
-      if (first_num_inputs != first_h5.numInputs(data_idx) ||
-          second_num_inputs != second_h5.numInputs(data_idx))
-        throw std::runtime_error("Wrong number of inputs in " + std::to_string(data_idx) +
-                                 "th data.");
-
-      // Do inference and return output
-      auto eval = [&](HDF5Importer &h5, uint32_t num_inputs,
-                      const std::vector<loco::Node *> &input_nodes, uint32_t num_outputs,
-                      const std::vector<loco::Node *> &output_nodes) {
-        // Write input data
-        for (uint32_t input_idx = 0; input_idx < num_inputs; input_idx++)
-        {
-          const auto *input_node =
-            loco::must_cast<const luci::CircleInput *>(input_nodes[input_idx]);
-          assert(input_node->index() == input_idx);
-
-          auto tensor = createEmptyTensor(input_node);
-          if (h5.isRawData())
-          {
-            h5.readTensor(data_idx, input_idx, tensor->buffer());
-          }
-          else
-          {
-            DataType dtype;
-            Shape shape;
-            h5.readTensor(data_idx, input_idx, &dtype, &shape, tensor->buffer());
-
-            // Check the type and the shape of the input data is valid
-            verifyTypeShape(input_node, dtype, shape);
-          }
-
-          interp->writeInputTensor(input_node, tensor->buffer(), tensor->byte_size());
-        }
-
-        // Interpret
-        interp->interpret();
-
-        // Read output data
-        std::vector<std::shared_ptr<Tensor>> outputs;
-        for (uint32_t output_idx = 0; output_idx < num_outputs; output_idx++)
-        {
-          const auto *output_node =
-            loco::must_cast<const luci::CircleOutput *>(output_nodes[output_idx]);
-          assert(output_node->index() == output_idx);
-
-          auto tensor = createEmptyTensor(output_node);
-          interp->readOutputTensor(output_node, tensor->buffer(), tensor->byte_size());
-          outputs.emplace_back(tensor);
-        }
-
-        return outputs;
-      };
-
-      auto first_output =
-        eval(first_h5, first_num_inputs, first_input_nodes, first_num_outputs, first_output_nodes);
-      auto second_output = eval(second_h5, second_num_inputs, second_input_nodes,
-                                second_num_outputs, second_output_nodes);
-
-      // Accumulate diffs
-      _metric->accumulate(first_output, second_output);
-    }
-
-    std::cout << "Evaluation finished. Number of data: " << first_num_data << std::endl;
-  }
-  catch (const H5::Exception &e)
-  {
-    H5::Exception::printErrorStack();
-    throw std::runtime_error("HDF5 error occurred.");
-  }
-
-  // Print metric
-  std::cout << _metric.get() << std::endl;
-}
-
-} // namespace circle_eval_diff
diff --git a/compiler/circle-eval-diff/src/ModuleEvalDiff.h b/compiler/circle-eval-diff/src/ModuleEvalDiff.h
deleted file mode 100644
index c7642f60b..000000000
--- a/compiler/circle-eval-diff/src/ModuleEvalDiff.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __CIRCLE_EVAL_DIFF_MODULE_EVAL_DIFF_H__
-#define __CIRCLE_EVAL_DIFF_MODULE_EVAL_DIFF_H__
-
-#include "MetricPrinter.h"
-
-#include <luci/IR/Module.h>
-
-#include <memory>
-
-namespace circle_eval_diff
-{
-
-class ModuleEvalDiff
-{
-public:
-  ModuleEvalDiff(std::unique_ptr<luci::Module> &&first, std::unique_ptr<luci::Module> &&second,
-                 std::unique_ptr<MetricPrinter> &&metric)
-    : _first_module(std::move(first)), _second_module(std::move(second)), _metric(std::move(metric))
-  {
-  }
-
-  virtual ~ModuleEvalDiff() = default;
-
-  // Implement this in the child class
-  virtual void evalDiff(const std::string &first_input_data_path,
-                        const std::string &second_input_data_path) const = 0;
-
-protected:
-  std::unique_ptr<luci::Module> _first_module;
-  std::unique_ptr<luci::Module> _second_module;
-  std::unique_ptr<MetricPrinter> _metric;
-};
-
-class H5InputEvalDiff final : public ModuleEvalDiff
-{
-public:
-  H5InputEvalDiff(std::unique_ptr<luci::Module> &&first, std::unique_ptr<luci::Module> &&second,
-                  std::unique_ptr<MetricPrinter> &&metric)
-    : ModuleEvalDiff(std::move(first), std::move(second), std::move(metric))
-  {
-  }
-
-  void evalDiff(const std::string &first_input_data_path,
-                const std::string &second_input_data_path) const;
-};
-
-// TODO Implement ModuleEvalDiff for random input and directory input
-
-} // namespace circle_eval_diff
-
-#endif // __CIRCLE_EVAL_DIFF_MODULE_EVAL_DIFF_H__
diff --git a/compiler/circle-eval-diff/src/Tensor.cpp b/compiler/circle-eval-diff/src/Tensor.cpp
index 6710e8c3d..c3efc44cd 100644
--- a/compiler/circle-eval-diff/src/Tensor.cpp
+++ b/compiler/circle-eval-diff/src/Tensor.cpp
@@ -16,8 +16,24 @@
 
 #include "Tensor.h"
 
+#include <luci/IR/CircleNodeDecl.h>
+
 #include <cassert>
 
+namespace
+{
+
+// Return the number of elements of the node, i.e. the product of its dims (1 if rank is 0).
+uint32_t numElements(const luci::CircleNode *node)
+{
+  uint32_t num_elem = 1;
+  for (uint32_t i = 0; i < node->rank(); ++i)
+    num_elem *= node->dim(i).value();
+  return num_elem;
+}
+
+} // namespace
+
 namespace circle_eval_diff
 {
 
@@ -69,4 +85,40 @@ INSTANTIATE(loco::DataType::FLOAT32);
 
 #undef INSTANTIATE
 
+// Return a Tensor with the same dtype and shape as node. Its buffer holds no data yet.
+// Throws std::runtime_error unless the dtype is FLOAT32, U8, S16, S32 or S64.
+std::shared_ptr<Tensor> createEmptyTensor(const luci::CircleNode *node)
+{
+  auto tensor = std::make_shared<Tensor>();
+  {
+    tensor->dtype(node->dtype());
+    tensor->rank(node->rank());
+    for (uint32_t i = 0; i < node->rank(); i++)
+      tensor->dim(i) = node->dim(i);
+
+    switch (node->dtype())
+    {
+      case loco::DataType::FLOAT32:
+        tensor->size<loco::DataType::FLOAT32>(numElements(node));
+        break;
+      case loco::DataType::U8:
+        tensor->size<loco::DataType::U8>(numElements(node));
+        break;
+      case loco::DataType::S16:
+        tensor->size<loco::DataType::S16>(numElements(node));
+        break;
+      case loco::DataType::S32:
+        tensor->size<loco::DataType::S32>(numElements(node));
+        break;
+      case loco::DataType::S64:
+        tensor->size<loco::DataType::S64>(numElements(node));
+        break;
+      default:
+        throw std::runtime_error("Unsupported input tensor dtype for " + node->name());
+    }
+  }
+
+  return tensor;
+}
+
 } // namespace circle_eval_diff
diff --git a/compiler/circle-eval-diff/src/Tensor.h b/compiler/circle-eval-diff/src/Tensor.h
index 65ab60638..d4f65d951 100644
--- a/compiler/circle-eval-diff/src/Tensor.h
+++ b/compiler/circle-eval-diff/src/Tensor.h
@@ -18,6 +18,7 @@
 #define __CIRCLE_EVAL_DIFF_TENSOR_H__
 
 #include <loco.h>
+#include <luci/IR/CircleNodeDecl.h>
 
 #include <vector>
 
@@ -76,6 +77,8 @@ private:
   std::vector<uint8_t> _data;
 };
 
+std::shared_ptr<Tensor> createEmptyTensor(const luci::CircleNode *node);
+
 } // namespace circle_eval_diff
 
 #endif // __CIRCLE_EVAL_DIFF_TENSOR_H__
diff --git a/compiler/circle-eval-diff/src/Tensor.test.cpp b/compiler/circle-eval-diff/src/Tensor.test.cpp
index 3bdeaecdf..395865748 100644
--- a/compiler/circle-eval-diff/src/Tensor.test.cpp
+++ b/compiler/circle-eval-diff/src/Tensor.test.cpp
@@ -18,6 +18,8 @@
 
 #include <gtest/gtest.h>
 
+#include <luci/IR/CircleNodes.h>
+
 using Tensor = circle_eval_diff::Tensor;
 
 namespace
@@ -99,3 +101,29 @@ TEST(CircleEvalDiffTensorTest, out_of_buffer_range_NEG)
 
   SUCCEED();
 }
+
+TEST(CircleEvalDiffTensorTest, createEmptyTensorTest)
+{
+  // Describe a FLOAT32 node of shape [1, 3, 3, 2].
+  luci::CircleInput input;
+  input.dtype(loco::DataType::FLOAT32);
+  input.rank(4);
+  input.dim(0).set(1);
+  input.dim(1).set(3);
+  input.dim(2).set(3);
+  input.dim(3).set(2);
+
+  // createEmptyTensor() must mirror the node's dtype, rank and every dimension.
+  auto tensor = circle_eval_diff::createEmptyTensor(&input);
+  EXPECT_EQ(loco::DataType::FLOAT32, tensor->dtype());
+  EXPECT_EQ(4, tensor->rank());
+  EXPECT_EQ(1, tensor->dim(0));
+  EXPECT_EQ(3, tensor->dim(1));
+  EXPECT_EQ(3, tensor->dim(2));
+  EXPECT_EQ(2, tensor->dim(3));
+}
diff --git a/compiler/circle-execution-plan/CMakeLists.txt b/compiler/circle-execution-plan/CMakeLists.txt
index 2f657c171..da74e021d 100644
--- a/compiler/circle-execution-plan/CMakeLists.txt
+++ b/compiler/circle-execution-plan/CMakeLists.txt
@@ -1,3 +1,9 @@
+nnas_find_package(Jsoncpp)
+if(NOT Jsoncpp_FOUND)
+    message(STATUS "Build circle-execution-plan: FAILED (missing jsoncpp)")
+    return() # skip this tool entirely when jsoncpp is unavailable
+endif(NOT Jsoncpp_FOUND)
+
 set(SOURCES
         pal/IScratchpadHelper.h
         pal/ScratchpadHelperLinux.h
@@ -10,6 +16,9 @@ set(SOURCES
         )
 
 add_executable(circle_execution_plan "${SOURCES}")
+target_include_directories(circle_execution_plan PRIVATE ${Jsoncpp_INCLUDE_DIRS})
+
+target_link_libraries(circle_execution_plan ${Jsoncpp_STATIC_LIB})
 target_link_libraries(circle_execution_plan foder)
 target_link_libraries(circle_execution_plan safemain)
 target_link_libraries(circle_execution_plan luci_env)
diff --git a/compiler/circle-execution-plan/src/CircleExecutionPlan.cpp b/compiler/circle-execution-plan/src/CircleExecutionPlan.cpp
index 1788124c3..d5ddf0ce9 100644
--- a/compiler/circle-execution-plan/src/CircleExecutionPlan.cpp
+++ b/compiler/circle-execution-plan/src/CircleExecutionPlan.cpp
@@ -33,20 +33,22 @@ int entry(int argc, char **argv)
 {
   arser::Arser arser("circle_execution_plan provides model with execution plan meta information");
 
-  arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model");
-  arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model");
-  arser.add_argument("--platform")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
-    .default_value("linux")
-    .help("Platform name: linux mcu cmsisnn");
+  arser.add_argument("input").help("Input circle model");
+  arser.add_argument("output").help("Output circle model");
+  arser.add_argument("--platform").default_value("linux").help("Platform name: linux mcu cmsisnn");
   arser.add_argument("--use_dsp")
     .nargs(1)
     .type(arser::DataType::BOOL)
     .required(false)
     .default_value(false)
     .help("Plan with or without dsp (now can be used only with cmsisnn)");
+  arser.add_argument("--save_allocations")
+    .nargs(1)
+    .required(false)
+    .default_value("")
+    .help("Path for output JSON file to save memory allocation info. "
+          "Note: path end of file should have 'tracealloc.json' (example path: "
+          "'../exec_plan_info.tracealloc.json')");
 
   try
   {
@@ -63,6 +65,7 @@ int entry(int argc, char **argv)
   const std::string output_path = arser.get<std::string>("output");
   const std::string platform_name = arser.get<std::string>("--platform");
   const bool use_dsp = arser.get<bool>("--use_dsp");
+  const std::string json_path = arser.get<std::string>("--save_allocations");
 
   if (platform_name != "cmsisnn" && use_dsp)
   {
@@ -89,6 +92,13 @@ int entry(int argc, char **argv)
     return EXIT_FAILURE;
   }
 
+  bool is_save_allocations = false;
+
+  if (!json_path.empty())
+  {
+    is_save_allocations = true;
+  }
+
   foder::FileLoader file_loader{input_path};
   std::vector<char> model_data;
 
@@ -124,6 +134,9 @@ int entry(int argc, char **argv)
   circle_planner::ExecutionPlanner execution_planner(module->graph(), {platform_type, use_dsp});
   execution_planner.make_execution_plan();
 
+  if (is_save_allocations)
+    execution_planner.create_json_allocation_file(json_path);
+
   // Export to output Circle file
   luci::CircleExporter exporter;
   luci::CircleFileExpContract contract(module.get(), output_path);
diff --git a/compiler/circle-execution-plan/src/ExecutionPlanner.cpp b/compiler/circle-execution-plan/src/ExecutionPlanner.cpp
index ec2ec1362..a1e6f7e1a 100644
--- a/compiler/circle-execution-plan/src/ExecutionPlanner.cpp
+++ b/compiler/circle-execution-plan/src/ExecutionPlanner.cpp
@@ -18,6 +18,9 @@
 #include <loco/IR/Algorithm.h>
 #include <luci/UserSettings.h>
 
+#include <json.h>
+#include <fstream>
+#include <stdexcept>
+
 namespace circle_planner
 {
 namespace
@@ -58,6 +61,29 @@ bool isTensorProducingNode(const luci::CircleNode *node)
   }
 }
 
+// Append one allocation entry describing circle_node to allocations_node.
+// Zero-sized allocations are skipped. An allocation whose last_node is
+// node_not_assigned is reported as alive until alive_till_max + 1.
+void create_allocation_node(Json::Value &allocations_node,
+                            const AllocationNodeInformation &alloca_node_inform,
+                            uint32_t alive_till_max, const luci::CircleNode *circle_node)
+{
+  if (alloca_node_inform.size == 0)
+    return;
+
+  Json::Value allocation_node;
+  allocation_node["offset"] = alloca_node_inform.offset;
+  allocation_node["size"] = alloca_node_inform.size;
+  allocation_node["alive_from"] = alloca_node_inform.first_node;
+
+  if (alloca_node_inform.last_node == node_not_assigned)
+    allocation_node["alive_till"] = alive_till_max + 1;
+  else
+    allocation_node["alive_till"] = alloca_node_inform.last_node;
+
+  allocation_node["origin"] = circle_node->name();
+  allocations_node.append(allocation_node);
+}
+
 } // namespace
 
 void ExecutionPlanner::make_execution_plan()
@@ -74,6 +100,50 @@ void ExecutionPlanner::make_execution_plan()
   settings->set(luci::UserSettings::Key::ExecutionPlanGen, true);
 }
 
+// Dump the planned memory allocations to json_path as a JSON document holding a
+// single segment ("Segment1") with one entry per non-empty allocation.
+// Throws std::runtime_error if json_path cannot be opened for writing.
+void ExecutionPlanner::create_json_allocation_file(const std::string &json_path)
+{
+  // Find max dealloc value to assign to nodes with node_not_assigned value
+  uint32_t alive_till_max = 0;
+  for (const auto elem : _dealloc_node)
+  {
+    if (alive_till_max < elem and elem != node_not_assigned)
+      alive_till_max = elem;
+  }
+
+  Json::Value allocations_node;
+  for (auto &alloc_node_inform : _alloc_node_inform_vector)
+  {
+    const auto node_num = alloc_node_inform.node_num;
+    const auto circle_node = loco::must_cast<luci::CircleNode *>(_ordered_nodes[node_num]);
+
+    create_allocation_node(allocations_node, alloc_node_inform, alive_till_max, circle_node);
+  }
+
+  // Create segment part
+  Json::Value segment_node;
+  segment_node["name"] = "Segment1";
+  segment_node["allocations"] = allocations_node;
+
+  Json::Value segments_node;
+  segments_node.append(segment_node);
+
+  Json::Value main_tree;
+  main_tree["schema_version"] = 1;
+  main_tree["segments"] = segments_node;
+
+  // Fail loudly instead of silently skipping: the caller explicitly requested this file
+  std::ofstream out(json_path);
+  if (!out.is_open())
+    throw std::runtime_error("Cannot open file for writing: " + json_path);
+
+  Json::StreamWriterBuilder builder;
+  const std::unique_ptr<Json::StreamWriter> writer(builder.newStreamWriter());
+  writer->write(main_tree, &out);
+}
+
 void ExecutionPlanner::get_default_execution_order_plan()
 {
   // Get execution order in _ordered_nodes
diff --git a/compiler/circle-execution-plan/src/ExecutionPlanner.h b/compiler/circle-execution-plan/src/ExecutionPlanner.h
index e0833c407..af3fba33e 100644
--- a/compiler/circle-execution-plan/src/ExecutionPlanner.h
+++ b/compiler/circle-execution-plan/src/ExecutionPlanner.h
@@ -104,6 +104,8 @@ public:
     _is_null_scratchpads = is_null_scratchpads;
   };
 
+  void create_json_allocation_file(const std::string &json_path);
+
 private:
   // Method gets default execution order plan and saves it in _ordered_nodes vector.
   // There can be different variants of execution order and this method provides main one.
diff --git a/compiler/circle-inspect/driver/Driver.cpp b/compiler/circle-inspect/driver/Driver.cpp
index 10e185de5..318a5826b 100644
--- a/compiler/circle-inspect/driver/Driver.cpp
+++ b/compiler/circle-inspect/driver/Driver.cpp
@@ -36,7 +36,7 @@ int entry(int argc, char **argv)
     .help("Dump Conv2D series weight operators in circle file");
   arser.add_argument("--op_version").nargs(0).help("Dump versions of the operators in circle file");
   arser.add_argument("--tensor_dtype").nargs(0).help("Dump dtype of tensors");
-  arser.add_argument("circle").type(arser::DataType::STR).help("Circle file to inspect");
+  arser.add_argument("circle").help("Circle file to inspect");
 
   try
   {
diff --git a/compiler/circle-inspect/requires.cmake b/compiler/circle-inspect/requires.cmake
index 362d67cf4..183dfe227 100644
--- a/compiler/circle-inspect/requires.cmake
+++ b/compiler/circle-inspect/requires.cmake
@@ -1,3 +1,4 @@
 require("arser")
+require("foder")
 require("mio-circle04")
 require("safemain")
diff --git a/compiler/circle-inspect/src/Dump.cpp b/compiler/circle-inspect/src/Dump.cpp
index bba5e56c3..aa8fed248 100644
--- a/compiler/circle-inspect/src/Dump.cpp
+++ b/compiler/circle-inspect/src/Dump.cpp
@@ -15,7 +15,9 @@
  */
 
 #include "Dump.h"
-#include "Reader.h"
+
+#include <mio_circle/Helper.h>
+#include <mio_circle/Reader.h>
 
 #include <ostream>
 
@@ -24,7 +26,7 @@ namespace circleinspect
 
 void DumpOperators::run(std::ostream &os, const circle::Model *model)
 {
-  circleinspect::Reader reader(model);
+  mio::circle::Reader reader(model);
 
   const uint32_t subgraph_size = reader.num_subgraph();
 
@@ -50,7 +52,7 @@ void DumpOperators::run(std::ostream &os, const circle::Model *model)
 namespace
 {
 
-const circle::Operator *operator_match_output(circleinspect::Reader &reader, const int32_t tensor)
+const circle::Operator *operator_match_output(mio::circle::Reader &reader, const int32_t tensor)
 {
   auto ops = reader.operators();
 
@@ -58,7 +60,7 @@ const circle::Operator *operator_match_output(circleinspect::Reader &reader, con
   {
     const auto op = ops->Get(i);
 
-    const std::vector<int32_t> &outputs = circleinspect::as_index_vector(op->outputs());
+    const std::vector<int32_t> &outputs = mio::circle::as_index_vector(op->outputs());
 
     for (auto output : outputs)
     {
@@ -69,7 +71,7 @@ const circle::Operator *operator_match_output(circleinspect::Reader &reader, con
   return nullptr;
 }
 
-size_t tensor_buffer_size(circleinspect::Reader &reader, const int32_t tensor_id)
+size_t tensor_buffer_size(mio::circle::Reader &reader, const int32_t tensor_id)
 {
   auto tensors = reader.tensors();
 
@@ -93,7 +95,7 @@ namespace circleinspect
 
 void DumpConv2DWeight::run(std::ostream &os, const circle::Model *model)
 {
-  circleinspect::Reader reader(model);
+  mio::circle::Reader reader(model);
 
   const uint32_t subgraph_size = reader.num_subgraph();
 
@@ -110,7 +112,7 @@ void DumpConv2DWeight::run(std::ostream &os, const circle::Model *model)
 
       if (bc == circle::BuiltinOperator_CONV_2D || bc == circle::BuiltinOperator_DEPTHWISE_CONV_2D)
       {
-        const std::vector<int32_t> &inputs = circleinspect::as_index_vector(op->inputs());
+        const std::vector<int32_t> &inputs = mio::circle::as_index_vector(op->inputs());
         if (inputs.size() < 2)
         {
           throw std::runtime_error("Operator has invalid input");
@@ -147,7 +149,7 @@ void DumpOperatorVersion::run(std::ostream &os, const circle::Model *model)
 {
   std::map<std::string, int32_t> op_version_map;
 
-  circleinspect::Reader reader(model);
+  mio::circle::Reader reader(model);
 
   // This assert is subject to be changed later
   assert(reader.num_subgraph() == 1);
@@ -181,7 +183,7 @@ namespace circleinspect
 
 void DumpTensorDType::run(std::ostream &os, const circle::Model *model)
 {
-  circleinspect::Reader reader(model);
+  mio::circle::Reader reader(model);
 
   const uint32_t subgraph_size = reader.num_subgraph();
 
diff --git a/compiler/circle-inspect/src/Reader.h b/compiler/circle-inspect/src/Reader.h
deleted file mode 100644
index c38ec3990..000000000
--- a/compiler/circle-inspect/src/Reader.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __READER_H__
-#define __READER_H__
-
-#include <mio/circle/schema_generated.h>
-
-#include <map>
-#include <string>
-#include <vector>
-
-namespace circleinspect
-{
-
-template <typename T> std::vector<T> as_index_vector(const flatbuffers::Vector<T> *flat_array)
-{
-  std::vector<T> ret(flat_array->Length());
-  for (uint32_t i = 0; i < flat_array->Length(); i++)
-  {
-    ret[i] = flat_array->Get(i);
-  }
-  return ret;
-}
-
-/**
- * @brief Loads Circle file and provides helpers to access attributes
- */
-class Reader
-{
-private:
-  using CircleSubGraphs_t = flatbuffers::Vector<flatbuffers::Offset<circle::SubGraph>>;
-  using CircleBuffers_t = flatbuffers::Vector<flatbuffers::Offset<circle::Buffer>>;
-  using CircleTensors_t = flatbuffers::Vector<flatbuffers::Offset<circle::Tensor>>;
-  using CircleOperators_t = flatbuffers::Vector<flatbuffers::Offset<circle::Operator>>;
-
-public:
-  Reader(const circle::Model *model);
-
-  Reader() = delete;
-
-public:
-  const std::vector<const circle::OperatorCode *> &opcodes() { return _op_codes; }
-  const CircleBuffers_t *buffers() { return _buffers; }
-  const CircleTensors_t *tensors() { return _tensors; }
-  const CircleOperators_t *operators() { return _operators; }
-  const std::vector<int32_t> &inputs() const { return _inputs; }
-  const std::vector<int32_t> &outputs() const { return _outputs; }
-
-  uint32_t num_subgraph() const { return _subgraphs->Length(); }
-
-  size_t buffer_info(uint32_t buf_idx, const uint8_t **buff_data);
-  circle::BuiltinOperator builtin_code(const circle::Operator *op) const;
-  std::string opcode_name(const circle::Operator *op) const;
-  std::string tensor_name(const circle::Tensor *tensor) const;
-  std::string tensor_dtype(const circle::Tensor *tensor) const;
-
-public:
-  bool select_subgraph(uint32_t subgraph);
-
-private:
-  const CircleSubGraphs_t *_subgraphs{nullptr};
-  const CircleBuffers_t *_buffers{nullptr};
-  const CircleTensors_t *_tensors{nullptr};
-  const CircleOperators_t *_operators{nullptr};
-
-  std::vector<const circle::OperatorCode *> _op_codes;
-  std::vector<int32_t> _inputs;
-  std::vector<int32_t> _outputs;
-};
-
-} // namespace circleinspect
-
-#endif // __READER_H__
diff --git a/compiler/circle-interpreter/CMakeLists.txt b/compiler/circle-interpreter/CMakeLists.txt
new file mode 100644
index 000000000..d18db3e11
--- /dev/null
+++ b/compiler/circle-interpreter/CMakeLists.txt
@@ -0,0 +1,13 @@
+set(INTERPRETER
+      src/CircleInterpreter.cpp
+   )
+
+add_executable(circle-interpreter ${INTERPRETER})
+target_link_libraries(circle-interpreter PRIVATE arser)
+target_link_libraries(circle-interpreter PRIVATE loco)
+target_link_libraries(circle-interpreter PRIVATE luci_import)
+target_link_libraries(circle-interpreter PRIVATE luci_interpreter)
+target_link_libraries(circle-interpreter PRIVATE safemain)
+target_link_libraries(circle-interpreter PRIVATE vconone)
+
+install(TARGETS circle-interpreter DESTINATION bin)
diff --git a/compiler/circle-interpreter/requires.cmake b/compiler/circle-interpreter/requires.cmake
new file mode 100644
index 000000000..a565df65b
--- /dev/null
+++ b/compiler/circle-interpreter/requires.cmake
@@ -0,0 +1,6 @@
+require("arser")
+require("loco")
+require("luci")
+require("luci-interpreter")
+require("safemain")
+require("vconone")
diff --git a/compiler/circle-interpreter/src/CircleInterpreter.cpp b/compiler/circle-interpreter/src/CircleInterpreter.cpp
new file mode 100644
index 000000000..1d241278d
--- /dev/null
+++ b/compiler/circle-interpreter/src/CircleInterpreter.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <arser/arser.h>
+#include <luci/ImporterEx.h>
+#include <luci_interpreter/Interpreter.h>
+#include <vconone/vconone.h>
+
+#include <cstdlib>
+#include <fstream>
+#include <vector>
+#include <string>
+
+namespace
+{
+
+// Fill `data` with `data_size` bytes read from the binary file `filename`.
+// Throws std::runtime_error when the file cannot be opened or the read fails.
+void readDataFromFile(const std::string &filename, char *data, size_t data_size)
+{
+  std::ifstream input(filename, std::ifstream::binary);
+  if (input.fail())
+  {
+    throw std::runtime_error("Cannot open file \"" + filename + "\".\n");
+  }
+
+  input.read(data, data_size);
+  if (input.fail())
+  {
+    throw std::runtime_error("Failed to read data from file \"" + filename + "\".\n");
+  }
+}
+
+// Write `data_size` bytes starting at `data` into the binary file `filename`.
+// Throws std::runtime_error when the file cannot be opened or the write fails.
+void writeDataToFile(const std::string &filename, const char *data, size_t data_size)
+{
+  std::ofstream output(filename, std::ofstream::binary);
+  if (output.fail())
+  {
+    throw std::runtime_error("Cannot open file \"" + filename + "\".\n");
+  }
+
+  output.write(data, data_size);
+  if (output.fail())
+    throw std::runtime_error("Failed to write data to file \"" + filename + "\".\n");
+}
+
+// Return the byte size of the tensor described by `node`: the size of its dtype
+// multiplied by the value of each of its dimensions.
+template <typename NodeT> size_t getTensorSize(const NodeT *node)
+{
+  // Accumulate in size_t (the return type) so that the product of a large tensor
+  // does not wrap around a 32-bit intermediate before it is widened.
+  size_t tensor_size = loco::size(node->dtype());
+  for (uint32_t i = 0; i < node->rank(); ++i)
+    tensor_size *= node->dim(i).value();
+  return tensor_size;
+}
+
+// Print the tool name with the vconone version string, then the copyright notice.
+void print_version(void)
+{
+  const auto version = vconone::get_string();
+  const auto copyright = vconone::get_copyright();
+
+  std::cout << "circle-interpreter version " << version << std::endl << copyright << std::endl;
+}
+
+} // namespace
+
+/*
+ * @brief CircleInterpreter main
+ *
+ *        Driver to invoke luci-interpreter
+ *
+ */
+int entry(int argc, char **argv)
+{
+  arser::Arser arser("Interpreter driver for circle models");
+
+  arser::Helper::add_version(arser, print_version);
+
+  arser.add_argument("model_path").help("Circle model filepath");
+  arser.add_argument("input_prefix")
+    .help("Input data filepath for circle model. "
+          "n-th input data is read from ${input_prefix}n, "
+          "for example, Add.circle.input0, Add.circle.input1");
+  arser.add_argument("output_prefix")
+    .help("Output data filepath for circle model. "
+          "Output data is written in ${output_prefix}n, "
+          "for example, Add.circle.output0");
+
+  try
+  {
+    arser.parse(argc, argv);
+  }
+  catch (const std::runtime_error &err)
+  {
+    // Report the parse error followed by the usage text
+    std::cout << err.what() << std::endl;
+    std::cout << arser;
+    return EXIT_FAILURE;
+  }
+
+  const auto filename = arser.get<std::string>("model_path");
+  const auto input_prefix = arser.get<std::string>("input_prefix");
+  const auto output_prefix = arser.get<std::string>("output_prefix");
+
+  // Load model from the file
+  luci::ImporterEx importer;
+  std::unique_ptr<luci::Module> module = importer.importVerifyModule(filename);
+  if (module == nullptr)
+  {
+    std::cerr << "ERROR: Failed to load '" << filename << "'" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  // Create interpreter.
+  luci_interpreter::Interpreter interpreter(module.get());
+
+  // Set input.
+  // Data for n'th input is read from ${input_prefix}n
+  // (ex: Add.circle.input0, Add.circle.input1 ..)
+  const auto input_nodes = loco::input_nodes(module->graph());
+  for (size_t i = 0; i < input_nodes.size(); i++)
+  {
+    const auto *input_node = loco::must_cast<const luci::CircleInput *>(input_nodes[i]);
+    std::vector<char> input_data(getTensorSize(input_node));
+    readDataFromFile(input_prefix + std::to_string(i), input_data.data(), input_data.size());
+    interpreter.writeInputTensor(input_node, input_data.data(), input_data.size());
+  }
+
+  // Do inference.
+  interpreter.interpret();
+
+  // Get output.
+  // The loop is bounded by the container that is actually indexed.
+  const auto output_nodes = loco::output_nodes(module->graph());
+  for (size_t i = 0; i < output_nodes.size(); i++)
+  {
+    const auto *output_node = loco::must_cast<const luci::CircleOutput *>(output_nodes[i]);
+    std::vector<char> output_data(getTensorSize(output_node));
+    interpreter.readOutputTensor(output_node, output_data.data(), output_data.size());
+
+    // Output data is written in ${output_prefix}n
+    // (ex: Add.circle.output0)
+    writeDataToFile(output_prefix + std::to_string(i), output_data.data(), output_data.size());
+  }
+  return EXIT_SUCCESS;
+}
diff --git a/compiler/circle-operator-test/CMakeLists.txt b/compiler/circle-operator-test/CMakeLists.txt
new file mode 100644
index 000000000..2ebd533b9
--- /dev/null
+++ b/compiler/circle-operator-test/CMakeLists.txt
@@ -0,0 +1,18 @@
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
+get_target_property(ARTIFACTS_PATH testDataGenerator BINARY_DIR)
+get_target_property(CIRCLE_OPERATOR_PATH circle-operator BINARY_DIR)
+set(CIRCLE_OPERATOR_PATH "${CIRCLE_OPERATOR_PATH}/circle-operator")
+
+nnas_find_package(GTest REQUIRED)
+
+file(GLOB_RECURSE TESTS "src/*.test.cpp")
+
+GTest_AddTest(circle-operator-test ${TESTS})
+
+set_tests_properties(circle-operator-test
+                     PROPERTIES
+                     ENVIRONMENT "ARTIFACTS_PATH=${ARTIFACTS_PATH};CIRCLE_OPERATOR_PATH=${CIRCLE_OPERATOR_PATH}"
+                     )
diff --git a/compiler/circle-operator-test/README.md b/compiler/circle-operator-test/README.md
new file mode 100644
index 000000000..d07c64d2e
--- /dev/null
+++ b/compiler/circle-operator-test/README.md
@@ -0,0 +1,7 @@
+# circle-operator-test
+
+_circle-operator-test_ verifies that the _circle-operator_ tool works as expected.
+
+Current tests include
+- input arguments are handled as expected
+- output of the tool is as expected
diff --git a/compiler/circle-operator-test/requires.cmake b/compiler/circle-operator-test/requires.cmake
new file mode 100644
index 000000000..8ad3b8a64
--- /dev/null
+++ b/compiler/circle-operator-test/requires.cmake
@@ -0,0 +1,2 @@
+require("circle-operator")
+require("common-artifacts")
diff --git a/compiler/circle-operator-test/src/circle-operator.test.cpp b/compiler/circle-operator-test/src/circle-operator.test.cpp
new file mode 100644
index 000000000..29c6f3792
--- /dev/null
+++ b/compiler/circle-operator-test/src/circle-operator.test.cpp
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstdlib>
+#include <fstream>
+#include <vector>
+
+class cirlce_operator_test : public ::testing::Test // NOTE(review): "cirlce" looks like a typo of "circle"
+{
+protected:
+  bool initialize(void);                // loads the two paths below from environment variables
+  bool run(const std::string &command); // runs a shell command and captures its output in _result
+
+protected:
+  bool load(const std::string &file); // reads the whole content of `file` into _result
+
+protected:
+  std::string _artifacts_path;       // value of ARTIFACTS_PATH
+  std::string _circle_operator_path; // value of CIRCLE_OPERATOR_PATH
+  std::string _result;               // text stored by the last successful run() or load()
+};
+
+// Load the paths used by the tests from the ARTIFACTS_PATH and CIRCLE_OPERATOR_PATH
+// environment variables.
+// Returns false, after naming the missing variable on stderr, if either one is unset.
+bool cirlce_operator_test::initialize(void)
+{
+  const char *path = std::getenv("ARTIFACTS_PATH");
+  if (path == nullptr)
+  {
+    std::cerr << "ARTIFACTS_PATH not found" << std::endl;
+    return false;
+  }
+  _artifacts_path = path;
+
+  path = std::getenv("CIRCLE_OPERATOR_PATH");
+  if (path == nullptr)
+  {
+    // the message names the variable that was actually looked up
+    std::cerr << "CIRCLE_OPERATOR_PATH not found" << std::endl;
+    return false;
+  }
+  _circle_operator_path = path;
+
+  return true;
+}
+
+// Execute `command` through popen() with stderr redirected to stdout, store everything
+// it prints in _result and echo it to std::cout.
+// Returns false if the pipe cannot be opened or if collecting the output throws.
+bool cirlce_operator_test::run(const std::string &command)
+{
+  std::vector<char> chunk(260);
+  std::string captured;
+  const std::string shell_command = command + " 2>&1";
+
+  FILE *stream = popen(shell_command.c_str(), "r");
+  if (stream == nullptr)
+    return false;
+
+  try
+  {
+    while (fgets(chunk.data(), chunk.size(), stream) != nullptr)
+      captured += chunk.data();
+  }
+  catch (...)
+  {
+    // close the pipe before reporting the failure
+    pclose(stream);
+    return false;
+  }
+  pclose(stream);
+
+  _result = captured;
+  std::cout << _result << std::endl;
+
+  return true;
+}
+
+// Read the whole content of `file` into _result.
+// Returns false, leaving _result untouched, when the file cannot be opened.
+bool cirlce_operator_test::load(const std::string &file)
+{
+  std::ifstream input(file.c_str());
+  if (!input.fail())
+  {
+    std::stringstream contents;
+    contents << input.rdbuf();
+    _result = contents.str();
+    return true;
+  }
+  return false;
+}
+
+// `--name` on Add_000.circle must produce output containing "ofm".
+TEST_F(cirlce_operator_test, valid_names)
+{
+  // ASSERT_* aborts the test on failure, so no explicit FAIL()/return is needed
+  ASSERT_TRUE(initialize());
+
+  const std::string model = _artifacts_path + "/Add_000.circle";
+  const std::string command = _circle_operator_path + " --name " + model;
+  ASSERT_TRUE(run(command));
+
+  ASSERT_NE(std::string::npos, _result.find("ofm"));
+}
+
+// `--code` on Add_000.circle must produce output containing "ADD".
+TEST_F(cirlce_operator_test, valid_codes)
+{
+  // ASSERT_* aborts the test on failure, so no explicit FAIL()/return is needed
+  ASSERT_TRUE(initialize());
+
+  const std::string model = _artifacts_path + "/Add_000.circle";
+  const std::string command = _circle_operator_path + " --code " + model;
+  ASSERT_TRUE(run(command));
+
+  ASSERT_NE(std::string::npos, _result.find("ADD"));
+}
+
+// The unknown option `--opname` must produce output containing "Invalid argument".
+TEST_F(cirlce_operator_test, invalid_option_NEG)
+{
+  // ASSERT_* aborts the test on failure, so no explicit FAIL()/return is needed
+  ASSERT_TRUE(initialize());
+
+  const std::string model = _artifacts_path + "/Add_000.circle";
+  const std::string command = _circle_operator_path + " --opname " + model;
+  ASSERT_TRUE(run(command));
+
+  ASSERT_NE(std::string::npos, _result.find("Invalid argument"));
+}
+
+// `--code --name` on Add_000.circle must produce output containing both "ofm" and "ADD".
+TEST_F(cirlce_operator_test, check_code_name)
+{
+  // ASSERT_* aborts the test on failure, so no explicit FAIL()/return is needed
+  ASSERT_TRUE(initialize());
+
+  const std::string model = _artifacts_path + "/Add_000.circle";
+  const std::string command = _circle_operator_path + " --code --name " + model;
+  ASSERT_TRUE(run(command));
+
+  ASSERT_NE(std::string::npos, _result.find("ofm"));
+  ASSERT_NE(std::string::npos, _result.find("ADD"));
+}
+
+TEST_F(cirlce_operator_test, nonexist_file_NEG)
+{
+  if (!initialize())
+  {
+    FAIL();
+    return;
+  }
+
+  std::string model = _artifacts_path + "/non_exist_file.foo";
+  std::string command = _circle_operator_path + " --name " + model;
+  if (!run(command))
+  {
+    FAIL();
+    return;
+  }
+
+  const auto pos = _result.find("ERROR");
+  ASSERT_NE(std::string::npos, pos);
+}
+
+TEST_F(cirlce_operator_test, invalid_file_NEG)
+{
+  if (!initialize())
+  {
+    FAIL();
+    return;
+  }
+
+  std::string model = _artifacts_path + "/Add_000.recipe";
+  std::string command = _circle_operator_path + " --name " + model;
+  if (!run(command))
+  {
+    FAIL();
+    return;
+  }
+
+  const auto pos = _result.find("ERROR");
+  ASSERT_NE(std::string::npos, pos);
+}
+
+// `--code --output_path <file>` must write a result containing "ADD" into <file>.
+TEST_F(cirlce_operator_test, output_file)
+{
+  // ASSERT_* aborts the test on failure, so no explicit FAIL()/return is needed
+  ASSERT_TRUE(initialize());
+
+  const std::string fileName("/tmp/a.txt");
+  // remove any file left from an earlier run so that load() only sees fresh output
+  std::remove(fileName.c_str());
+  const std::string model = _artifacts_path + "/Add_000.circle";
+  const std::string command =
+    _circle_operator_path + " --code --output_path " + fileName + " " + model;
+  ASSERT_TRUE(run(command));
+  // load() replaces _result with the content of the output file
+  ASSERT_TRUE(load(fileName));
+
+  ASSERT_NE(std::string::npos, _result.find("ADD"));
+}
diff --git a/compiler/circle-operator/CMakeLists.txt b/compiler/circle-operator/CMakeLists.txt
new file mode 100644
index 000000000..6817a8618
--- /dev/null
+++ b/compiler/circle-operator/CMakeLists.txt
@@ -0,0 +1,17 @@
+if(NOT TARGET mio_circle04)
+  return()
+endif(NOT TARGET mio_circle04)
+
+set(DRIVER "driver/Driver.cpp")
+
+file(GLOB_RECURSE SOURCES "src/*.cpp")
+
+add_executable(circle-operator ${DRIVER} ${SOURCES})
+target_include_directories(circle-operator PRIVATE src)
+target_link_libraries(circle-operator arser)
+target_link_libraries(circle-operator foder)
+target_link_libraries(circle-operator mio_circle04)
+target_link_libraries(circle-operator mio_circle04_helper)
+target_link_libraries(circle-operator safemain)
+
+install(TARGETS circle-operator DESTINATION bin)
diff --git a/compiler/circle-operator/README.md b/compiler/circle-operator/README.md
new file mode 100644
index 000000000..86a923f05
--- /dev/null
+++ b/compiler/circle-operator/README.md
@@ -0,0 +1,70 @@
+# circle-operator
+
+_circle-operator_ allows users to retrieve operator information from a Circle model file.
+
+NOTE: this tool is primarily for ONE-vscode, where PartEditor needs the names and codes
+of the operators.
+
+## Information with operators
+
+Operators with `--name`
+- show operator names one line at a time in execution order
+
+Example
+```
+$ circle-operator --name model.circle
+```
+
+Result
+```
+conv1_pad/Pad
+conv1_conv/BiasAdd
+pool1_pad/Pad
+```
+
+Operator codes with `--code`
+- show operator codes one line at a time in execution order
+
+Example
+```
+$ circle-operator --code model.circle
+```
+
+Result
+```
+PAD
+CONV_2D
+PAD
+```
+
+Operators with both `--code` and `--name`
+- show both the code and the name of each operator, separated by `,`, one line at a time in execution order
+
+Example
+```
+$ circle-operator --code --name model.circle
+```
+
+Result
+```
+PAD,conv1_pad/Pad
+CONV_2D,conv1_conv/BiasAdd
+PAD,pool1_pad/Pad
+```
+
+## Save to file
+
+Use `--output_path` to save results to a file.
+
+Example
+```
+$ circle-operator --name --output_path /tmp/result model.circle
+```
+
+Result
+```
+$ cat /tmp/result
+conv1_pad/Pad
+conv1_conv/BiasAdd
+pool1_pad/Pad
+```
diff --git a/compiler/circle-operator/driver/Driver.cpp b/compiler/circle-operator/driver/Driver.cpp
new file mode 100644
index 000000000..f5fd8073c
--- /dev/null
+++ b/compiler/circle-operator/driver/Driver.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Dump.h"
+
+#include <arser/arser.h>
+#include <foder/FileLoader.h>
+#include <fstream>
+
+#include <functional>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <vector>
+#include <string>
+
+#include <signal.h>
+#include <unistd.h>
+
+// SIGSEGV handler: report the failure on stderr and terminate with exit status 255.
+// Only async-signal-safe calls (write, _exit) are used; iostreams and exit() are not
+// safe to call from a signal handler.
+// Unlike exit(), _exit() does not flush stdio/iostream buffers, so output that was
+// still buffered when the fault happened is dropped.
+void handle_segfault(int /* signal */, siginfo_t * /* si */, void * /* arg */)
+{
+  static const char message[] = "ERROR: Failed to load file\n";
+  // the result is deliberately ignored: nothing more can be done if stderr is unusable
+  const ssize_t written = ::write(STDERR_FILENO, message, sizeof(message) - 1);
+  static_cast<void>(written);
+  _exit(255);
+}
+
+// Command-line entry point of circle-operator.
+// Parses the options, optionally redirects the output to --output_path, loads the
+// Circle file and dumps its operators. Returns 0 on success and 255 on failure.
+int entry(int argc, char **argv)
+{
+  // TODO add option to dump for all sub-graphs
+  arser::Arser arser{
+    "circle-operator allows users to retrieve operator information from a Circle model file"};
+  arser.add_argument("--name").nargs(0).help("Dump operators name in circle file");
+  arser.add_argument("--code").nargs(0).help("Dump operators code in circle file");
+  arser.add_argument("--output_path").help("Save output to file (default output is console)");
+  arser.add_argument("circle").help("Circle file to dump");
+
+  try
+  {
+    arser.parse(argc, argv);
+  }
+  catch (const std::runtime_error &err)
+  {
+    std::cerr << err.what() << std::endl;
+    std::cerr << arser;
+    return 255;
+  }
+
+  cirops::DumpOption option;
+  option.names = arser["--name"];
+  option.codes = arser["--code"];
+
+  // Output goes to std::cout unless --output_path is given
+  std::ofstream oFstream;
+  std::ostream *oStream = &std::cout;
+  if (arser["--output_path"])
+  {
+    auto output_path = arser.get<std::string>("--output_path");
+    oFstream.open(output_path, std::ofstream::out | std::ofstream::trunc);
+    if (oFstream.fail())
+    {
+      std::cerr << "ERROR: Failed to create output to file " << output_path << std::endl;
+      return 255;
+    }
+    oStream = &oFstream;
+  }
+
+  // hook segmentation fault so that a crash is reported by handle_segfault
+  // value-initialization zeroes the whole struct, no memset needed
+  struct sigaction sa = {};
+  sigemptyset(&sa.sa_mask);
+  sa.sa_sigaction = handle_segfault;
+  sa.sa_flags = SA_SIGINFO;
+  sigaction(SIGSEGV, &sa, nullptr);
+
+  std::string modelFile = arser.get<std::string>("circle");
+  // Load Circle model from a circle file
+  try
+  {
+    foder::FileLoader fileLoader{modelFile};
+    std::vector<char> modelData = fileLoader.load();
+    const circle::Model *circleModel = circle::GetModel(modelData.data());
+    if (circleModel == nullptr)
+    {
+      std::cerr << "ERROR: Failed to load circle '" << modelFile << "'" << std::endl;
+      return 255;
+    }
+    cirops::DumpOperators dump;
+    dump.run(*oStream, circleModel, option);
+  }
+  catch (const std::runtime_error &err)
+  {
+    std::cerr << "ERROR: " << err.what() << std::endl;
+    return 255;
+  }
+
+  if (oFstream.is_open())
+  {
+    oFstream.close();
+  }
+
+  return 0;
+}
diff --git a/compiler/circle-operator/requires.cmake b/compiler/circle-operator/requires.cmake
new file mode 100644
index 000000000..183dfe227
--- /dev/null
+++ b/compiler/circle-operator/requires.cmake
@@ -0,0 +1,4 @@
+require("arser")
+require("foder")
+require("mio-circle04")
+require("safemain")
diff --git a/compiler/circle-operator/src/Dump.cpp b/compiler/circle-operator/src/Dump.cpp
new file mode 100644
index 000000000..36bfe8632
--- /dev/null
+++ b/compiler/circle-operator/src/Dump.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Dump.h"
+
+#include <mio_circle/Helper.h>
+#include <mio_circle/Reader.h>
+
+#include <ostream>
+
+namespace
+{
+
+// Print one line per operator of the reader's current operator list, in stored order:
+// the opcode name when option.codes is set, the name of the operator's first output
+// tensor when option.names is set, and both joined by ',' when both are set.
+void dump_ops(std::ostream &os, mio::circle::Reader &reader, const cirops::DumpOption &option)
+{
+  const auto ops = reader.operators();
+  // the tensor table does not depend on the operator, so fetch it once
+  const auto tensors = reader.tensors();
+  for (uint32_t i = 0; i < ops->Length(); ++i)
+  {
+    const auto op = ops->Get(i);
+    // Looked up unconditionally, even when codes are not printed.
+    // NOTE(review): this lookup may also validate the opcode; confirm before making
+    // it conditional on option.codes
+    const auto op_name = reader.opcode_name(op);
+
+    if (option.all_graphs)
+    {
+      // NOTE all_graphs is false for now
+      // TODO check using '$' as split key
+      os << i << "$";
+    }
+
+    if (option.codes)
+    {
+      os << op_name;
+    }
+    if (option.names)
+    {
+      // TODO multiple outputs?
+      // NOTE(review): at(0) presumably throws when the operator has no output; confirm
+      const auto output_tensors = reader.outputs(op);
+      const auto output = output_tensors.at(0);
+      const auto tensor = tensors->Get(output);
+      const std::string name = mio::circle::tensor_name(tensor);
+      if (option.codes)
+      {
+        os << ",";
+      }
+      os << name;
+    }
+    os << std::endl;
+  }
+}
+
+} // namespace
+
+namespace cirops
+{
+
+// Dump the operators of the model's subgraphs to `os`: only the first subgraph unless
+// option.all_graphs is set, in which case every subgraph is dumped in order.
+void DumpOperators::run(std::ostream &os, const circle::Model *model, const DumpOption &option)
+{
+  mio::circle::Reader reader(model);
+
+  const uint32_t total = reader.num_subgraph();
+  // without all_graphs only subgraph 0 (if there is one) is visited
+  const uint32_t count = (option.all_graphs || total == 0) ? total : 1;
+  for (uint32_t index = 0; index < count; ++index)
+  {
+    reader.select_subgraph(index);
+    dump_ops(os, reader, option);
+  }
+}
+
+} // namespace cirops
diff --git a/compiler/circle-operator/src/Dump.h b/compiler/circle-operator/src/Dump.h
new file mode 100644
index 000000000..aa1d1be49
--- /dev/null
+++ b/compiler/circle-operator/src/Dump.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __DUMP_H__
+#define __DUMP_H__
+
+#include <mio/circle/schema_generated.h>
+
+#include <ostream>
+
+namespace cirops
+{
+
+struct DumpOption // selects what DumpOperators::run prints
+{
+  bool names = false;      // print operator names
+  bool codes = false;      // print operator codes
+  bool all_graphs = false; // dump every subgraph instead of only the first one
+};
+
+class DumpOperators // dumps operator codes and/or names of a Circle model
+{
+public:
+  DumpOperators() = default;
+
+public: // run() writes the operators of `model` to `os`, as selected by `option`
+  void run(std::ostream &os, const circle::Model *model, const DumpOption &option);
+};
+
+} // namespace cirops
+
+#endif // __DUMP_H__
diff --git a/compiler/circle-opselector/driver/Driver.cpp b/compiler/circle-opselector/driver/Driver.cpp
index a1ace4f58..4b39a6ddb 100644
--- a/compiler/circle-opselector/driver/Driver.cpp
+++ b/compiler/circle-opselector/driver/Driver.cpp
@@ -159,26 +159,16 @@ int entry(int argc, char **argv)
 
   arser::Arser arser("circle-opselector provides selecting operations in circle model");
 
-  arser.add_argument("--version")
-    .nargs(0)
-    .default_value(false)
-    .help("Show version information and exit")
-    .exit_with(print_version);
+  arser::Helper::add_version(arser, print_version);
 
   // TODO Add new options!
 
-  arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model");
-  arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model");
+  arser.add_argument("input").help("Input circle model");
+  arser.add_argument("output").help("Output circle model");
 
   // select option
-  arser.add_argument("--by_id")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .help("Input operation id to select nodes.");
-  arser.add_argument("--by_name")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .help("Input operation name to select nodes.");
+  arser.add_argument("--by_id").help("Input operation id to select nodes.");
+  arser.add_argument("--by_name").help("Input operation name to select nodes.");
 
   try
   {
diff --git a/compiler/circle-part-value-test/CMakeLists.txt b/compiler/circle-part-value-test/CMakeLists.txt
index 0657607d2..ffe1b8909 100644
--- a/compiler/circle-part-value-test/CMakeLists.txt
+++ b/compiler/circle-part-value-test/CMakeLists.txt
@@ -82,7 +82,8 @@ foreach(IDX RANGE ${RECIPE_LENGTH_M1})
 
   # Run partitioner
   add_custom_command(OUTPUT ${PARTITIONER_CONN_JSON}
-    COMMAND circle-partitioner "${PART_FILE}" "${PARTITION_NAME}.circle" "${PARTITIONER_OUTPUT_PATH}"
+    COMMAND circle-partitioner "--part_file" "${PART_FILE}" "--input_file"
+            "${PARTITION_NAME}.circle" "--work_path" "${PARTITIONER_OUTPUT_PATH}"
     DEPENDS circle-partitioner ${PART_DST_PATH} ${CIRCLE_DST_PATH}
     COMMENT "Parition ${RECIPE_NAME}.circle with ${PART_FILE}"
   )
diff --git a/compiler/circle-partitioner-test/CMakeLists.txt b/compiler/circle-partitioner-test/CMakeLists.txt
index e29a66b41..7b26b3ba7 100644
--- a/compiler/circle-partitioner-test/CMakeLists.txt
+++ b/compiler/circle-partitioner-test/CMakeLists.txt
@@ -57,7 +57,8 @@ foreach(IDX RANGE ${RECIPE_LENGTH_M1})
   # Run partitioner
   set(PART_CONN_JSON "${PART_OUT_PATH}/${PART_NAME}.conn.json")
   add_custom_command(OUTPUT ${PART_CONN_JSON}
-    COMMAND circle-partitioner "${PART_FILE}" "${PART_NAME}.circle" "${PART_OUT_PATH}"
+    COMMAND circle-partitioner "--part_file" "${PART_FILE}" "--input_file"
+            "${PART_NAME}.circle" "--work_path" "${PART_OUT_PATH}"
     DEPENDS circle-partitioner ${CIRCLE_DST_PATH} ${PART_DST_PATH}
     COMMENT "Parition ${RECIPE_NAME}.circle with ${PART_FILE}"
   )
diff --git a/compiler/circle-partitioner/CMakeLists.txt b/compiler/circle-partitioner/CMakeLists.txt
index 9b8f5afae..abc5d93fb 100644
--- a/compiler/circle-partitioner/CMakeLists.txt
+++ b/compiler/circle-partitioner/CMakeLists.txt
@@ -1,7 +1,6 @@
 file(GLOB_RECURSE SOURCES "src/*.cpp")
 
 add_executable(circle-partitioner "${SOURCES}")
-target_link_libraries(circle-partitioner foder)
 target_link_libraries(circle-partitioner crew)
 target_link_libraries(circle-partitioner safemain)
 target_link_libraries(circle-partitioner luci_lang)
@@ -17,22 +16,3 @@ target_link_libraries(circle-partitioner vconone)
 target_link_libraries(circle-partitioner nncc_common)
 
 install(TARGETS circle-partitioner DESTINATION bin)
-
-# TODO remove circle_partitioner
-add_executable(circle_partitioner "${SOURCES}")
-target_link_libraries(circle_partitioner foder)
-target_link_libraries(circle_partitioner crew)
-target_link_libraries(circle_partitioner safemain)
-target_link_libraries(circle_partitioner luci_lang)
-target_link_libraries(circle_partitioner luci_log)
-target_link_libraries(circle_partitioner luci_import)
-target_link_libraries(circle_partitioner luci_service)
-target_link_libraries(circle_partitioner luci_pass)
-target_link_libraries(circle_partitioner luci_export)
-target_link_libraries(circle_partitioner luci_partition)
-target_link_libraries(circle_partitioner arser)
-target_link_libraries(circle_partitioner pepper_csv2vec)
-target_link_libraries(circle_partitioner vconone)
-target_link_libraries(circle_partitioner nncc_common)
-
-install(TARGETS circle_partitioner DESTINATION bin)
diff --git a/compiler/circle-partitioner/README.md b/compiler/circle-partitioner/README.md
index 2e0a98638..760cf28d1 100644
--- a/compiler/circle-partitioner/README.md
+++ b/compiler/circle-partitioner/README.md
@@ -4,10 +4,10 @@ _circle-partitioner_ provides model partitioning of circle model to two or more
 
 ## How circle-partitioner work
 
-_circle-partitioner_ requires 3 positional arguments
-- first: `partition` file
-- second: `input` circle model file
-- third: `work` folder
+_circle-partitioner_ requires 3 arguments for input files
+- `--part_file`: `partition` file, use extension `.part`
+- `--input_file`: `input` circle model file
+- `--work_path`: `work` path where input files reside. This is optional and defaults to the CWD if omitted
 
 And options to override `partition` file as a helper to try out without editing `partition` file.
 - `--backends`: override `backends` of `[partition]` section
@@ -20,7 +20,7 @@ are read from `work` folder.
 Outputs are (1) one or more partitioned circle models and (2) connection file that gives how
 the partitioned models should be connected to act like the source `input` model.
 
-Why does input files be placed in `work` folder too?
+Why do input files need to be placed in the `work` path too?
 - this is still work in progress condition
 - use cases are still ambigious
 - original `input` model file can be used by the backend, so `.conn` file links it as `source`
@@ -94,7 +94,8 @@ Net_InstanceNorm_003/
 
 Command example
 ```
-./circle-partitioner Net_InstanceNorm_003.part Net_InstanceNorm_003.circle Net_InstanceNorm_003
+./circle-partitioner --part_file Net_InstanceNorm_003.part \
+--input_file Net_InstanceNorm_003.circle --work_path Net_InstanceNorm_003
 ```
 
 Result of _circle-partitioner_
@@ -171,11 +172,11 @@ Consider partitioning with backends of OneRT
 
 Let's try with this command:
 ```
-circle_partitioner \
-   --partition Net_InstanceNorm_003.part \
-   --backends cpu,acl_cl \
-   --default cpu \
-   Net_InstanceNorm_003.circle Net_InstanceNorm_003
+circle-partitioner \
+   --backends cpu,acl_cl --default cpu \
+   --part_file Net_InstanceNorm_003.part \
+   --input_file Net_InstanceNorm_003.circle \
+   --work_path Net_InstanceNorm_003
 ```
 
 where `Net_InstanceNorm_003.part` is like this for initial design
diff --git a/compiler/circle-partitioner/requires.cmake b/compiler/circle-partitioner/requires.cmake
index 690d9531c..82d9c2b0f 100644
--- a/compiler/circle-partitioner/requires.cmake
+++ b/compiler/circle-partitioner/requires.cmake
@@ -1,4 +1,3 @@
-require("foder")
 require("crew")
 require("pepper-csv2vec")
 require("safemain")
diff --git a/compiler/circle-partitioner/src/CirclePartitioner.cpp b/compiler/circle-partitioner/src/CirclePartitioner.cpp
index 0151e92d3..5cecb9ae0 100644
--- a/compiler/circle-partitioner/src/CirclePartitioner.cpp
+++ b/compiler/circle-partitioner/src/CirclePartitioner.cpp
@@ -18,9 +18,7 @@
 #include "PartitionExport.h"
 #include "HelperPath.h"
 
-#include <foder/FileLoader.h>
-
-#include <luci/Importer.h>
+#include <luci/ImporterEx.h>
 #include <luci/Service/Validate.h>
 #include <luci/CircleExporter.h>
 #include <luci/CircleFileExpContract.h>
@@ -41,9 +39,9 @@ namespace
 
 const char *opt_bks = "--backends";
 const char *opt_def = "--default";
-const char *opt_part = "partition";
-const char *opt_input = "input";
-const char *opt_work = "work";
+const char *opt_part_file = "--part_file";
+const char *opt_input_file = "--input_file";
+const char *opt_work_path = "--work_path";
 
 void print_version(void)
 {
@@ -53,63 +51,25 @@ void print_version(void)
 
 void build_arser(arser::Arser &arser)
 {
-  arser.add_argument("--version")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("Show version information and exit")
-    .exit_with(print_version);
-
-  arser.add_argument(opt_bks)
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
-    .help("Backends in CSV to use for partitioning");
-
-  arser.add_argument(opt_def)
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
-    .help("Default backend to assign");
-
-  arser.add_argument(opt_part)
-    .nargs(1)
-    .type(arser::DataType::STR)
+  arser::Helper::add_version(arser, print_version);
+
+  arser.add_argument(opt_bks).help("Backends in CSV to use for partitioning");
+
+  arser.add_argument(opt_def).help("Default backend to assign");
+
+  arser.add_argument(opt_part_file)
+    .required(true)
     .help("Partition file which provides backend to assign");
-  arser.add_argument(opt_input)
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .help("Input circle model filename");
-  arser.add_argument(opt_work)
-    .nargs(1)
-    .type(arser::DataType::STR)
+  arser.add_argument(opt_input_file).required(true).help("Input circle model filename");
+  arser.add_argument(opt_work_path)
     .help("Work folder of partition, input files exist and output files are produced");
 }
 
 std::unique_ptr<luci::Module> load_model(const std::string &input_path)
 {
-  // Load model from the file
-  foder::FileLoader file_loader{input_path};
-  std::vector<char> model_data = file_loader.load();
-
-  // Verify flatbuffers
-  flatbuffers::Verifier verifier{reinterpret_cast<uint8_t *>(model_data.data()), model_data.size()};
-  if (!circle::VerifyModelBuffer(verifier))
-  {
-    std::cerr << "ERROR: Invalid input file '" << input_path << "'" << std::endl;
-    return nullptr;
-  }
-
-  const circle::Model *circle_model = circle::GetModel(model_data.data());
-  if (circle_model == nullptr)
-  {
-    std::cerr << "ERROR: Failed to load circle '" << input_path << "'" << std::endl;
-    return nullptr;
-  }
-
   // Import from input Circle file
-  luci::Importer importer;
-  return importer.importModule(circle_model);
+  luci::ImporterEx importerex;
+  return importerex.importVerifyModule(input_path);
 }
 
 } // namespace
@@ -133,9 +93,14 @@ int entry(int argc, char **argv)
     return EXIT_FAILURE;
   }
 
-  std::string partition_file = arser.get<std::string>(opt_part);
-  std::string input_file = arser.get<std::string>(opt_input);
-  std::string work_folder = arser.get<std::string>(opt_work);
+  std::string partition_file = arser.get<std::string>(opt_part_file);
+  std::string input_file = arser.get<std::string>(opt_input_file);
+  std::string work_folder = ".";
+
+  if (arser[opt_work_path])
+  {
+    work_folder = arser.get<std::string>(opt_work_path);
+  }
 
   std::string partition_path = work_folder + "/" + partition_file;
   std::string input_path = work_folder + "/" + input_file;
diff --git a/compiler/circle-quantizer-dredd-recipe-test/CMakeLists.txt b/compiler/circle-quantizer-dredd-recipe-test/CMakeLists.txt
index 5ec8b6ee5..a3a2902d9 100644
--- a/compiler/circle-quantizer-dredd-recipe-test/CMakeLists.txt
+++ b/compiler/circle-quantizer-dredd-recipe-test/CMakeLists.txt
@@ -18,7 +18,7 @@ unset(TEST_NAMES)
 get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR)
 
 set(options USE_QCONFIG)
-set(oneValueArgs DTYPE GRANULARITY)
+set(oneValueArgs DTYPE GRANULARITY INPUT_DTYPE OUTPUT_DTYPE)
 set(multiValueArgs "")
 
 macro(Add RECIPE)
@@ -29,6 +29,16 @@ macro(Add RECIPE)
     set(QCONFIG_OPT "--config" "${ARTIFACTS_BIN_PATH}/${RECIPE}.qconf.json")
   endif()
 
+  set(INPUT_DTYPE_OPT "")
+  if(ARG_INPUT_DTYPE)
+    set(INPUT_DTYPE_OPT "--input_type" "${ARG_INPUT_DTYPE}")
+  endif()
+
+  set(OUTPUT_DTYPE_OPT "")
+  if(ARG_OUTPUT_DTYPE)
+    set(OUTPUT_DTYPE_OPT "--output_type" "${ARG_OUTPUT_DTYPE}")
+  endif()
+
   set(CIRCLE_PATH "${ARTIFACTS_BIN_PATH}/${RECIPE}.circle")
   set(FAKE_QUANT_CIRCLE_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RECIPE}.fq.circle")
   set(RECORDED_CIRCLE_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RECIPE}.recorded.circle")
@@ -38,7 +48,10 @@ macro(Add RECIPE)
   add_custom_command(OUTPUT ${QUANT_CIRCLE_PATH}
     COMMAND $<TARGET_FILE:circle-quantizer> --quantize_dequantize_weights float32 ${ARG_DTYPE} ${ARG_GRANULARITY} ${QCONFIG_OPT} ${CIRCLE_PATH} ${FAKE_QUANT_CIRCLE_PATH}
     COMMAND $<TARGET_FILE:record-minmax> --input_model ${FAKE_QUANT_CIRCLE_PATH} --output_model ${RECORDED_CIRCLE_PATH}
-    COMMAND $<TARGET_FILE:circle-quantizer> --quantize_with_minmax float32 ${ARG_DTYPE} ${ARG_GRANULARITY} ${QCONFIG_OPT} ${RECORDED_CIRCLE_PATH} ${QUANT_CIRCLE_PATH}
+    COMMAND $<TARGET_FILE:circle-quantizer>
+      --quantize_with_minmax float32 ${ARG_DTYPE} ${ARG_GRANULARITY}
+      ${QCONFIG_OPT} ${RECORDED_CIRCLE_PATH} ${QUANT_CIRCLE_PATH}
+      ${INPUT_DTYPE_OPT} ${OUTPUT_DTYPE_OPT}
     DEPENDS 
       circle-quantizer
       record-minmax
diff --git a/compiler/circle-quantizer-dredd-recipe-test/test.lst b/compiler/circle-quantizer-dredd-recipe-test/test.lst
index 188103016..58f89c767 100644
--- a/compiler/circle-quantizer-dredd-recipe-test/test.lst
+++ b/compiler/circle-quantizer-dredd-recipe-test/test.lst
@@ -6,10 +6,75 @@
 
 ## TFLITE RECIPE
 
+# MPQ Test (default: u8, target: s16)
+Add(Quant_Add_001 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_AveragePool2D_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_BatchMatMul_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Concatenation_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Conv_003 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_DepthwiseConv2D_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_FullyConnected_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_LeakyRelu_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Logistic_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_MaxPool2D_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Mean_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Mul_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Neg_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Pad_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_PRelu_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_ReLU_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_ReLU6_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Reshape_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_ResizeBilinear_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_ResizeNearestNeighbor_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Slice_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Softmax_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Tanh_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Transpose_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_TransposeConv_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+
+# MPQ Test (default: s16, target: u8)
+Add(Quant_Add_002 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_AveragePool2D_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_BatchMatMul_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Concatenation_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Conv_004 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_DepthwiseConv2D_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_FullyConnected_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_LeakyRelu_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Logistic_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_MaxPool2D_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Mean_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Mul_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Neg_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Pad_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_PRelu_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_ReLU_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_ReLU6_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Reshape_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_ResizeBilinear_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_ResizeNearestNeighbor_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Slice_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Softmax_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Tanh_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Transpose_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+Add(Quant_TransposeConv_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
+
 Add(Quant_Conv_Mul_Add_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
 Add(Quant_Conv_Mul_Add_001 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
 Add(Quant_Conv_Mul_Add_002 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
 Add(Quant_Split_Add_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
 Add(Quant_Split_Add_001 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+Add(Quant_Conv_000 DTYPE uint8 GRANULARITY channel INPUT_DTYPE float32)
+Add(Quant_Conv_001 DTYPE uint8 GRANULARITY channel OUTPUT_DTYPE float32)
+Add(Quant_Conv_002 DTYPE uint8 GRANULARITY channel INPUT_DTYPE float32 OUTPUT_DTYPE float32)
 
 AddFakeQuant(Quant_Add_000)
+
+## CIRCLE RECIPE
+
+# MPQ Test (default: u8, target: s16)
+Add(Quant_InstanceNorm_000 DTYPE uint8 GRANULARITY channel USE_QCONFIG)
+
+# MPQ Test (default: s16, target: u8)
+Add(Quant_InstanceNorm_001 DTYPE int16 GRANULARITY channel USE_QCONFIG)
diff --git a/compiler/circle-quantizer/CMakeLists.txt b/compiler/circle-quantizer/CMakeLists.txt
index 14e00972b..16e41a327 100644
--- a/compiler/circle-quantizer/CMakeLists.txt
+++ b/compiler/circle-quantizer/CMakeLists.txt
@@ -10,7 +10,6 @@ add_executable(circle-quantizer "${SOURCES}")
 target_include_directories(circle-quantizer PRIVATE ${Jsoncpp_INCLUDE_DIRS})
 
 target_link_libraries(circle-quantizer ${Jsoncpp_STATIC_LIB})
-target_link_libraries(circle-quantizer foder)
 target_link_libraries(circle-quantizer safemain)
 target_link_libraries(circle-quantizer oops)
 target_link_libraries(circle-quantizer loco)
diff --git a/compiler/circle-quantizer/requires.cmake b/compiler/circle-quantizer/requires.cmake
index c21e28e8d..4fcee1873 100644
--- a/compiler/circle-quantizer/requires.cmake
+++ b/compiler/circle-quantizer/requires.cmake
@@ -1,4 +1,3 @@
-require("foder")
 require("loco")
 require("locop")
 require("safemain")
diff --git a/compiler/circle-quantizer/src/CircleQuantizer.cpp b/compiler/circle-quantizer/src/CircleQuantizer.cpp
index e0c85cb6e..f1e31ed8d 100644
--- a/compiler/circle-quantizer/src/CircleQuantizer.cpp
+++ b/compiler/circle-quantizer/src/CircleQuantizer.cpp
@@ -14,9 +14,7 @@
  * limitations under the License.
  */
 
-#include <foder/FileLoader.h>
-
-#include <luci/Importer.h>
+#include <luci/ImporterEx.h>
 #include <luci/CircleQuantizer.h>
 #include <luci/Service/Validate.h>
 #include <luci/CircleExporter.h>
@@ -59,13 +57,31 @@ std::vector<std::shared_ptr<LayerParam>> read_layer_params(std::string &filename
   std::vector<std::shared_ptr<LayerParam>> p;
   for (auto layer : layers)
   {
-    auto l = std::make_shared<LayerParam>();
+    if (layer.isMember("name"))
     {
-      l->name = layer["name"].asString();
-      l->dtype = layer["dtype"].asString();
-      l->granularity = layer["granularity"].asString();
+      auto l = std::make_shared<LayerParam>();
+      {
+        l->name = layer["name"].asString();
+        l->dtype = layer["dtype"].asString();
+        l->granularity = layer["granularity"].asString();
+      }
+      p.emplace_back(l);
+    }
+
+    // Multiple names with the same dtype & granularity
+    if (layer.isMember("names"))
+    {
+      for (auto name : layer["names"])
+      {
+        auto l = std::make_shared<LayerParam>();
+        {
+          l->name = name.asString();
+          l->dtype = layer["dtype"].asString();
+          l->granularity = layer["granularity"].asString();
+        }
+        p.emplace_back(l);
+      }
     }
-    p.emplace_back(l);
   }
 
   return p;
@@ -109,23 +125,12 @@ int entry(int argc, char **argv)
 
   arser::Arser arser("circle-quantizer provides circle model quantization");
 
-  arser.add_argument("--version")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("Show version information and exit")
-    .exit_with(print_version);
-
-  arser.add_argument("-V", "--verbose")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("output additional information to stdout or stderr");
+  arser::Helper::add_version(arser, print_version);
+  arser::Helper::add_verbose(arser);
 
   arser.add_argument(qdqw)
     .nargs(3)
     .type(arser::DataType::STR_VEC)
-    .required(false)
     .help("Quantize-dequantize weight values required action before quantization. "
           "Three arguments required: input_model_dtype(float32) "
           "output_model_dtype(uint8) granularity(layer, channel)");
@@ -133,28 +138,24 @@ int entry(int argc, char **argv)
   arser.add_argument(qwmm)
     .nargs(3)
     .type(arser::DataType::STR_VEC)
-    .required(false)
     .help("Quantize with min/max values. "
           "Three arguments required: input_model_dtype(float32) "
           "output_model_dtype(uint8) granularity(layer, channel)");
 
   arser.add_argument(tf_maxpool)
     .nargs(0)
-    .required(false)
     .default_value(false)
     .help("Force MaxPool Op to have the same input/output quantparams. NOTE: This feature can "
           "degrade accuracy of some models");
 
   arser.add_argument(fake_quant)
     .nargs(0)
-    .required(false)
     .help("Convert a quantized model to a fake-quantized model. NOTE: This feature will "
           "generate an fp32 model.");
 
   arser.add_argument(rq)
     .nargs(2)
     .type(arser::DataType::STR_VEC)
-    .required(false)
     .help("Requantize a quantized model. "
           "Two arguments required: input_model_dtype(int8) "
           "output_model_dtype(uint8)");
@@ -162,7 +163,6 @@ int entry(int argc, char **argv)
   arser.add_argument(fq)
     .nargs(3)
     .type(arser::DataType::STR_VEC)
-    .required(false)
     .accumulated(true)
     .help("Write quantization parameters to the specified tensor. "
           "Three arguments required: tensor_name(string), "
@@ -171,32 +171,21 @@ int entry(int argc, char **argv)
   arser.add_argument(cq)
     .nargs(2)
     .type(arser::DataType::STR_VEC)
-    .required(false)
     .accumulated(true)
     .help("Copy quantization parameter from a tensor to another tensor."
           "Two arguments required: source_tensor_name(string), "
           "destination_tensor_name(string)");
 
   arser.add_argument("--input_type")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
-    .help("Input type of quantized model (uint8 or int16)");
+    .help("Input type of quantized model (uint8, int16, or float32)");
 
   arser.add_argument("--output_type")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
-    .help("Output type of quantized model (uint8 or int16)");
+    .help("Output type of quantized model (uint8, int16, or float32)");
 
-  arser.add_argument(cfg)
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
-    .help("Path to the quantization configuration file");
+  arser.add_argument(cfg).help("Path to the quantization configuration file");
 
-  arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model");
-  arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model");
+  arser.add_argument("input").help("Input circle model");
+  arser.add_argument("output").help("Output circle model");
 
   arser.add_argument(gpd).nargs(0).required(false).default_value(false).help(
     "This will turn on profiling data generation.");
@@ -384,27 +373,10 @@ int entry(int argc, char **argv)
     settings->set(luci::UserSettings::Key::ProfilingDataGen, true);
 
   // Load model from the file
-  foder::FileLoader file_loader{input_path};
-  std::vector<char> model_data = file_loader.load();
-
-  // Verify flatbuffers
-  flatbuffers::Verifier verifier{reinterpret_cast<uint8_t *>(model_data.data()), model_data.size()};
-  if (!circle::VerifyModelBuffer(verifier))
-  {
-    std::cerr << "ERROR: Invalid input file '" << input_path << "'" << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  const circle::Model *circle_model = circle::GetModel(model_data.data());
-  if (circle_model == nullptr)
-  {
-    std::cerr << "ERROR: Failed to load circle '" << input_path << "'" << std::endl;
+  luci::ImporterEx importerex;
+  auto module = importerex.importVerifyModule(input_path);
+  if (module.get() == nullptr)
     return EXIT_FAILURE;
-  }
-
-  // Import from input Circle file
-  luci::Importer importer;
-  auto module = importer.importModule(circle_model);
 
   for (size_t idx = 0; idx < module->size(); ++idx)
   {
diff --git a/compiler/circle-tensordump/driver/Driver.cpp b/compiler/circle-tensordump/driver/Driver.cpp
index 70f3c8d84..c32dc3f5a 100644
--- a/compiler/circle-tensordump/driver/Driver.cpp
+++ b/compiler/circle-tensordump/driver/Driver.cpp
@@ -31,11 +31,9 @@ int entry(int argc, char **argv)
   arser::Arser arser{
     "circle-tensordump allows users to retrieve tensor information from a Circle model file"};
 
-  arser.add_argument("circle").nargs(1).type(arser::DataType::STR).help("Circle file path to dump");
+  arser.add_argument("circle").help("Circle file path to dump");
   arser.add_argument("--tensors").nargs(0).help("Dump to console");
   arser.add_argument("--tensors_to_hdf5")
-    .nargs(1)
-    .type(arser::DataType::STR)
     .help("Dump to hdf5 file. Specify hdf5 file path to be dumped");
 
   try
diff --git a/compiler/circle-tensordump/src/Dump.cpp b/compiler/circle-tensordump/src/Dump.cpp
index e477a7417..49afa73df 100644
--- a/compiler/circle-tensordump/src/Dump.cpp
+++ b/compiler/circle-tensordump/src/Dump.cpp
@@ -15,7 +15,8 @@
  */
 
 #include "Dump.h"
-#include "Reader.h"
+
+#include <mio_circle/Reader.h>
 
 #include <H5Cpp.h>
 
@@ -102,7 +103,7 @@ namespace circletensordump
 
 void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::string &)
 {
-  circletensordump::Reader reader(model);
+  mio::circle::Reader reader(model);
   uint32_t num_subgraph = reader.num_subgraph();
   auto buffers = reader.buffers();
 
@@ -296,7 +297,7 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model,
                             const std::string &output_path)
 {
   // loads a circle model
-  circletensordump::Reader reader(model);
+  mio::circle::Reader reader(model);
   uint32_t num_subgraph = reader.num_subgraph();
 
   // create a hdf5 file
diff --git a/compiler/circle-tensordump/src/Reader.cpp b/compiler/circle-tensordump/src/Reader.cpp
deleted file mode 100644
index 47b876054..000000000
--- a/compiler/circle-tensordump/src/Reader.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Reader.h"
-
-#include <mio_circle/Helper.h>
-
-#include <sstream>
-#include <string>
-
-namespace circletensordump
-{
-
-Reader::Reader(const circle::Model *model)
-{
-  _subgraphs = model->subgraphs();
-  _buffers = model->buffers();
-
-  auto opcodes = model->operator_codes();
-  for (const ::circle::OperatorCode *opcode : *opcodes)
-  {
-    _op_codes.push_back(opcode);
-  }
-}
-
-size_t Reader::buffer_info(uint32_t buf_idx, const uint8_t **buff_data)
-{
-  if (buff_data != nullptr)
-  {
-    *buff_data = nullptr;
-  }
-
-  if (buf_idx == 0)
-    return 0;
-
-  if (auto *buffer = (*_buffers)[buf_idx])
-  {
-    if (auto *array = buffer->data())
-    {
-      if (size_t size = array->size())
-      {
-        if (buff_data != nullptr)
-        {
-          *buff_data = reinterpret_cast<const uint8_t *>(array->data());
-        }
-        return size;
-      }
-    }
-  }
-
-  return 0;
-}
-
-circle::BuiltinOperator Reader::builtin_code(const circle::Operator *op) const
-{
-  uint32_t index = op->opcode_index();
-  assert(index < _op_codes.size());
-  const circle::OperatorCode *opcode = _op_codes.at(index);
-
-  return mio::circle::builtin_code_neutral(opcode);
-}
-
-std::string Reader::opcode_name(const circle::Operator *op) const
-{
-  uint32_t index = op->opcode_index();
-  assert(index < _op_codes.size());
-  const circle::OperatorCode *opcode = _op_codes.at(index);
-
-  if (!mio::circle::is_valid(opcode))
-  {
-    std::ostringstream oss;
-    oss << "(invalid: " << index << ")";
-    return oss.str();
-  }
-
-  return mio::circle::opcode_name(opcode);
-}
-
-bool Reader::select_subgraph(uint32_t sgindex)
-{
-  _tensors = nullptr;
-  _operators = nullptr;
-
-  _inputs.clear();
-  _outputs.clear();
-
-  if (_subgraphs->Length() <= sgindex)
-  {
-    assert(false);
-    return false;
-  }
-
-  const circle::SubGraph *subgraph = (*_subgraphs)[sgindex];
-
-  _tensors = subgraph->tensors();
-  _operators = subgraph->operators();
-
-  _inputs = as_index_vector(subgraph->inputs());
-  _outputs = as_index_vector(subgraph->outputs());
-
-  return true;
-}
-
-} // namespace circletensordump
diff --git a/compiler/circle-tensordump/src/Reader.h b/compiler/circle-tensordump/src/Reader.h
deleted file mode 100644
index c868bc277..000000000
--- a/compiler/circle-tensordump/src/Reader.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __CIRCLE_TENSORDUMP_READER_H__
-#define __CIRCLE_TENSORDUMP_READER_H__
-
-#include <mio/circle/schema_generated.h>
-
-#include <map>
-#include <string>
-#include <vector>
-
-namespace circletensordump
-{
-
-template <typename T> std::vector<T> as_index_vector(const flatbuffers::Vector<T> *flat_array)
-{
-  std::vector<T> ret(flat_array->Length());
-  for (uint32_t i = 0; i < flat_array->Length(); i++)
-  {
-    ret[i] = flat_array->Get(i);
-  }
-  return ret;
-}
-
-/**
- * @brief Loads Circle file and provides helpers to access attributes
- */
-class Reader
-{
-private:
-  using CircleSubGraphs_t = flatbuffers::Vector<flatbuffers::Offset<circle::SubGraph>>;
-  using CircleBuffers_t = flatbuffers::Vector<flatbuffers::Offset<circle::Buffer>>;
-  using CircleTensors_t = flatbuffers::Vector<flatbuffers::Offset<circle::Tensor>>;
-  using CircleOperators_t = flatbuffers::Vector<flatbuffers::Offset<circle::Operator>>;
-
-public:
-  Reader(const circle::Model *model);
-
-  Reader() = delete;
-
-public:
-  const std::vector<const circle::OperatorCode *> &opcodes() { return _op_codes; }
-  const CircleBuffers_t *buffers() { return _buffers; }
-  const CircleTensors_t *tensors() { return _tensors; }
-  const CircleOperators_t *operators() { return _operators; }
-  const std::vector<int32_t> &inputs() const { return _inputs; }
-  const std::vector<int32_t> &outputs() const { return _outputs; }
-
-  uint32_t num_subgraph() const { return _subgraphs->Length(); }
-
-  size_t buffer_info(uint32_t buf_idx, const uint8_t **buff_data);
-  circle::BuiltinOperator builtin_code(const circle::Operator *op) const;
-  std::string opcode_name(const circle::Operator *op) const;
-
-public:
-  bool select_subgraph(uint32_t subgraph);
-
-private:
-  const CircleSubGraphs_t *_subgraphs{nullptr};
-  const CircleBuffers_t *_buffers{nullptr};
-  const CircleTensors_t *_tensors{nullptr};
-  const CircleOperators_t *_operators{nullptr};
-
-  std::vector<const circle::OperatorCode *> _op_codes;
-  std::vector<int32_t> _inputs;
-  std::vector<int32_t> _outputs;
-};
-
-} // namespace circletensordump
-
-#endif // __CIRCLE_TENSORDUMP_READER_H__
diff --git a/compiler/circle-verify/src/Driver.cpp b/compiler/circle-verify/src/Driver.cpp
index 7a44c65b9..c3a414701 100644
--- a/compiler/circle-verify/src/Driver.cpp
+++ b/compiler/circle-verify/src/Driver.cpp
@@ -25,7 +25,7 @@
 int entry(int argc, char **argv)
 {
   arser::Arser arser;
-  arser.add_argument("circle").type(arser::DataType::STR).help("Circle file path to verify");
+  arser.add_argument("circle").help("Circle file path to verify");
 
   try
   {
diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst
index f41aac303..a6f2786d2 100644
--- a/compiler/circle2circle-dredd-recipe-test/test.lst
+++ b/compiler/circle2circle-dredd-recipe-test/test.lst
@@ -31,6 +31,8 @@ Add(Net_TConv_Add_002 PASS fuse_add_with_tconv)
 Add(Net_TConv_BN_000 PASS fuse_batchnorm_with_tconv)
 Add(Net_TConv_BN_001 PASS fuse_batchnorm_with_tconv)
 Add(Net_TConv_BN_002 PASS fuse_batchnorm_with_tconv)
+Add(Net_TConv_BN_003 PASS fuse_batchnorm_with_tconv)
+Add(Net_TConv_BN_004 PASS fuse_batchnorm_with_tconv)
 Add(Net_InstanceNorm_001 PASS fuse_instnorm)
 Add(Net_InstanceNorm_003 PASS fuse_instnorm)
 Add(Net_InstanceNorm_004 PASS fuse_instnorm)
@@ -46,6 +48,7 @@ Add(StridedSlice_003 PASS substitute_strided_slice_to_reshape)
 Add(MaxPoolWithArgmax_000 PASS resolve_customop_max_pool_with_argmax)
 Add(MaxPoolWithArgmax_001 PASS resolve_customop_max_pool_with_argmax)
 Add(MaxPoolWithArgmax_002 PASS resolve_customop_max_pool_with_argmax)
+Add(FullyConnected_007 PASS replace_non_const_fc_with_batch_matmul)
 
 ## CIRCLE RECIPE
 
diff --git a/compiler/circle2circle/CMakeLists.txt b/compiler/circle2circle/CMakeLists.txt
index cd79967b7..dbe485b9f 100644
--- a/compiler/circle2circle/CMakeLists.txt
+++ b/compiler/circle2circle/CMakeLists.txt
@@ -4,7 +4,6 @@ list(REMOVE_ITEM SOURCES ${TESTS})
 
 add_executable(circle2circle "${SOURCES}")
 target_include_directories(circle2circle PRIVATE src)
-target_link_libraries(circle2circle foder)
 target_link_libraries(circle2circle nncc_common)
 target_link_libraries(circle2circle safemain)
 target_link_libraries(circle2circle oops)
@@ -29,7 +28,6 @@ nnas_find_package(GTest REQUIRED)
 
 GTest_AddTest(circle2circle_test ${TESTS} ${SOURCES})
 target_include_directories(circle2circle_test PRIVATE src)
-target_link_libraries(circle2circle_test foder)
 target_link_libraries(circle2circle_test nncc_common)
 target_link_libraries(circle2circle_test oops)
 target_link_libraries(circle2circle_test hermes)
diff --git a/compiler/circle2circle/requires.cmake b/compiler/circle2circle/requires.cmake
index b6c61198f..4e5ed0dd1 100644
--- a/compiler/circle2circle/requires.cmake
+++ b/compiler/circle2circle/requires.cmake
@@ -1,4 +1,3 @@
-require("foder")
 require("loco")
 require("locop")
 require("logo-core")
diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp
index ae677a321..f5cf0d782 100644
--- a/compiler/circle2circle/src/Circle2Circle.cpp
+++ b/compiler/circle2circle/src/Circle2Circle.cpp
@@ -14,9 +14,7 @@
  * limitations under the License.
  */
 
-#include <foder/FileLoader.h>
-
-#include <luci/Importer.h>
+#include <luci/ImporterEx.h>
 #include <luci/CircleOptimizer.h>
 #include <luci/Service/ChangeOutputs.h>
 #include <luci/Service/Validate.h>
@@ -54,6 +52,11 @@ void csv_tokenize(const std::string &data, std::vector<std::string> &result)
     result.push_back(token);
 }
 
+void add_switch(arser::Arser &arser, const char *opt, const char *desc)
+{
+  arser.add_argument(opt).nargs(0).default_value(false).help(desc);
+}
+
 int entry(int argc, char **argv)
 {
   // Simple argument parser (based on map)
@@ -64,368 +67,125 @@ int entry(int argc, char **argv)
 
   arser::Arser arser("circle2circle provides circle model optimization and transformations");
 
-  arser.add_argument("--version")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("Show version information and exit")
-    .exit_with(print_version);
-
-  arser.add_argument("-V", "--verbose")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("output additional information to stdout or stderr");
-
-  arser.add_argument("--O1").nargs(0).required(false).default_value(false).help(
-    "Enable O1 optimize options");
-
-  arser.add_argument("--fold_add_v2")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fold AddV2 operators with constant inputs");
-
-  arser.add_argument("--fold_cast")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fold Cast operators with constant input");
-
-  arser.add_argument("--fold_dequantize")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fold dequantize op");
-
-  arser.add_argument("--fold_dwconv")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fold Depthwise Convolution operator with constant inputs");
-
-  arser.add_argument("--fold_gather")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fold Gather operator");
-
-  arser.add_argument("--fold_sparse_to_dense")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fold SparseToDense operator");
-
-  arser.add_argument("--forward_reshape_to_unaryop")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will move Reshape after UnaryOp for centain condition");
-
-  arser.add_argument("--fuse_activation_function")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fuse Activation function to a preceding operator");
-
-  arser.add_argument("--fuse_add_with_fully_connected")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fuse Add operator to FullyConnected operator");
-
-  arser.add_argument("--fuse_add_with_tconv")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fuse Add operator to Transposed Convolution operator");
-
-  arser.add_argument("--fuse_batchnorm_with_conv")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fuse BatchNorm operators to Convolution operator");
-
-  arser.add_argument("--fuse_batchnorm_with_dwconv")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fuse BatchNorm operators to Depthwise Convolution operator");
-
-  arser.add_argument("--fuse_batchnorm_with_tconv")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fuse BatchNorm operators to Transposed Convolution operator");
-
-  arser.add_argument("--fuse_bcq")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fuse operators and apply Binary Coded Quantization");
-
-  arser.add_argument("--fuse_instnorm")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fuse operators to InstanceNorm operator");
-
-  arser.add_argument("--fuse_mean_with_mean")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fuse two Mean operations when they follow one by one."
-          "This will fold them into one operation and merge reduction indices.");
-
-  arser.add_argument("--fuse_transpose_with_mean")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fuse Mean operation with a preceding Transpose under certain conditions.");
-
-  arser.add_argument("--make_batchnorm_gamma_positive")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will make negative gamma of BatchNorm into a small positive value (1e-10). Note "
-          "that this pass can change the execution result of the model. So, use it only when the "
-          "impact is known to be acceptable.");
-
-  arser.add_argument("--fuse_preactivation_batchnorm")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fuse BatchNorm operators of pre-activations to Convolution operator");
-
-  arser.add_argument("--remove_fakequant")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will remove FakeQuant operators");
-
-  arser.add_argument("--remove_quantdequant")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will remove Quantize-Dequantize sequence");
-
-  arser.add_argument("--remove_redundant_quantize")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will remove redundant Quantize operators");
-
-  arser.add_argument("--remove_redundant_reshape")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fuse or remove subsequent Reshape operators");
-
-  arser.add_argument("--remove_redundant_transpose")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will fuse or remove subsequent Transpose operators");
-
-  arser.add_argument("--remove_unnecessary_reshape")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will remove unnecessary reshape operators");
-
-  arser.add_argument("--remove_unnecessary_slice")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will remove unnecessary slice operators");
-
-  arser.add_argument("--remove_unnecessary_strided_slice")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will remove unnecessary strided slice operators");
-
-  arser.add_argument("--remove_unnecessary_split")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will remove unnecessary split operators");
-
-  arser.add_argument("--replace_cw_mul_add_with_depthwise_conv")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will replace channel-wise mul/add with DepthwiseConv2D operator");
-
-  arser.add_argument("--replace_sub_with_add")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will replace sub with add operator");
-
-  arser.add_argument("--resolve_customop_add")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will convert Custom(Add) to Add operator");
-
-  arser.add_argument("--resolve_customop_batchmatmul")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will convert Custom(BatchMatmul) to BatchMatmul operator");
-
-  arser.add_argument("--resolve_customop_matmul")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will convert Custom(Matmul) to Matmul operator");
-
-  arser.add_argument("--resolve_customop_max_pool_with_argmax")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will convert Custom(MaxPoolWithArgmax) to equivalent set of operators");
-
-  arser.add_argument("--shuffle_weight_to_16x1float32")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will convert weight format of FullyConnected to SHUFFLED16x1FLOAT32. Note that "
-          "it only converts weights whose row is a multiple of 16");
-
-  arser.add_argument("--substitute_pack_to_reshape")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will convert single input Pack to Reshape");
-
-  arser.add_argument("--substitute_padv2_to_pad")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will convert certain condition PadV2 to Pad");
-
-  arser.add_argument("--substitute_splitv_to_split")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will convert certain condition SplitV to Split operator");
-
-  arser.add_argument("--substitute_squeeze_to_reshape")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will convert certain condition Squeeze to Reshape");
-
-  arser.add_argument("--substitute_strided_slice_to_reshape")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will convert certain condition Strided_Slice to Reshape");
-
-  arser.add_argument("--substitute_transpose_to_reshape")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will convert single input Transpose to Reshape");
-
-  arser.add_argument("--expand_broadcast_const")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will expand broadcastable constant inputs");
-
-  arser.add_argument("--convert_nchw_to_nhwc")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("Experimental: This will convert NCHW operators to NHWC under the assumption that "
-          "input model is NCHW.");
-
-  arser.add_argument("--nchw_to_nhwc_input_shape")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("Convert the input shape of the model (argument for --convert_nchw_to_nhwc).");
-
-  arser.add_argument("--nchw_to_nhwc_output_shape")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("Convert the output shape of the model (argument for --convert_nchw_to_nhwc).");
-
-  arser.add_argument("--transform_min_max_to_relu6")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("Transform Minimum(6)-Maximum(0) pattern to Relu6 operator");
-
-  arser.add_argument("--transform_min_relu_to_relu6")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("Transform Minimum(6)-Relu pattern to Relu6 operator");
-
-  arser.add_argument("--mute_warnings")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will turn off warning messages");
-
-  arser.add_argument("--disable_validation")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will turn off operator validations. May help input model investigation.");
-
-  arser.add_argument("--generate_profile_data")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("This will turn on profiling data generation.");
+  arser::Helper::add_version(arser, print_version);
+  arser::Helper::add_verbose(arser);
+
+  add_switch(arser, "--fold_add_v2", "This will fold AddV2 operators with constant inputs");
+  add_switch(arser, "--fold_cast", "This will fold Cast operators with constant input");
+  add_switch(arser, "--fold_densify",
+             "This will fold Densify operators with sparse constant input");
+  add_switch(arser, "--fold_dequantize", "This will fold dequantize op");
+  add_switch(arser, "--fold_dwconv",
+             "This will fold Depthwise Convolution operator with constant inputs");
+  add_switch(arser, "--fold_gather", "This will fold Gather operator");
+  add_switch(arser, "--fold_sparse_to_dense", "This will fold SparseToDense operator");
+  add_switch(arser, "--forward_reshape_to_unaryop",
+             "This will move Reshape after UnaryOp for certain condition");
+  add_switch(arser, "--fuse_activation_function",
+             "This will fuse Activation function to a preceding operator");
+  add_switch(arser, "--fuse_add_with_fully_connected",
+             "This will fuse Add operator to FullyConnected operator");
+  add_switch(arser, "--fuse_add_with_tconv",
+             "This will fuse Add operator to Transposed Convolution operator");
+  add_switch(arser, "--fuse_batchnorm_with_conv",
+             "This will fuse BatchNorm operators to Convolution operator");
+  add_switch(arser, "--fuse_batchnorm_with_dwconv",
+             "This will fuse BatchNorm operators to Depthwise Convolution operator");
+  add_switch(arser, "--fuse_batchnorm_with_tconv",
+             "This will fuse BatchNorm operators to Transposed Convolution operator");
+  add_switch(arser, "--fuse_bcq", "This will fuse operators and apply Binary Coded Quantization");
+  add_switch(arser, "--fuse_instnorm", "This will fuse operators to InstanceNorm operator");
+  add_switch(arser, "--fuse_mean_with_mean",
+             "This will fuse two Mean operations when they follow one by one. This will fold them "
+             "into one operation and merge reduction indices.");
+  add_switch(arser, "--fuse_transpose_with_mean",
+             "This will fuse Mean operation with a preceding Transpose under certain conditions.");
+  add_switch(arser, "--make_batchnorm_gamma_positive",
+             "This will make negative gamma of BatchNorm into a small positive value (1e-10). "
+             "Note that this pass can change the execution result of the model. So, use it only "
+             "when the impact is known to be acceptable.");
+  add_switch(arser, "--fuse_preactivation_batchnorm",
+             "This will fuse BatchNorm operators of pre-activations to Convolution operator");
+  add_switch(arser, "--remove_fakequant", "This will remove FakeQuant operators");
+  add_switch(arser, "--remove_quantdequant", "This will remove Quantize-Dequantize sequence");
+  add_switch(arser, "--remove_redundant_quantize", "This will remove redundant Quantize operators");
+  add_switch(arser, "--remove_redundant_reshape",
+             "This will fuse or remove subsequent Reshape operators");
+  add_switch(arser, "--remove_redundant_transpose",
+             "This will fuse or remove subsequent Transpose operators");
+  add_switch(arser, "--remove_unnecessary_reshape",
+             "This will remove unnecessary reshape operators");
+  add_switch(arser, "--remove_unnecessary_slice", "This will remove unnecessary slice operators");
+  add_switch(arser, "--remove_unnecessary_strided_slice",
+             "This will remove unnecessary strided slice operators");
+  add_switch(arser, "--remove_unnecessary_split", "This will remove unnecessary split operators");
+  add_switch(arser, "--replace_cw_mul_add_with_depthwise_conv",
+             "This will replace channel-wise mul/add with DepthwiseConv2D operator");
+  add_switch(arser, "--replace_sub_with_add", "This will replace sub with add operator");
+  add_switch(arser, "--resolve_customop_add", "This will convert Custom(Add) to Add operator");
+  add_switch(arser, "--resolve_customop_batchmatmul",
+             "This will convert Custom(BatchMatmul) to BatchMatmul operator");
+  add_switch(arser, "--resolve_customop_matmul",
+             "This will convert Custom(Matmul) to Matmul operator");
+  add_switch(arser, "--resolve_customop_max_pool_with_argmax",
+             "This will convert Custom(MaxPoolWithArgmax) to equivalent set of operators");
+  add_switch(arser, "--resolve_customop_splitv",
+             "This will convert Custom(SplitV) to SplitV operator");
+  add_switch(arser, "--shuffle_weight_to_16x1float32",
+             "This will convert weight format of FullyConnected to SHUFFLED16x1FLOAT32. Note that "
+             "it only converts weights whose row is a multiple of 16");
+  add_switch(arser, "--replace_non_const_fc_with_batch_matmul",
+             "Replace FullyConnected with BatchMatMul when its weight is non-constant");
+  add_switch(arser, "--substitute_pack_to_reshape",
+             "This will convert single input Pack to Reshape");
+  add_switch(arser, "--substitute_padv2_to_pad",
+             "This will convert certain condition PadV2 to Pad");
+  add_switch(arser, "--substitute_splitv_to_split",
+             "This will convert certain condition SplitV to Split operator");
+  add_switch(arser, "--substitute_squeeze_to_reshape",
+             "This will convert certain condition Squeeze to Reshape");
+  add_switch(arser, "--substitute_strided_slice_to_reshape",
+             "This will convert certain condition Strided_Slice to Reshape");
+  add_switch(arser, "--substitute_transpose_to_reshape",
+             "This will convert single input Transpose to Reshape");
+  add_switch(arser, "--expand_broadcast_const", "This will expand broadcastable constant inputs");
+  add_switch(arser, "--convert_nchw_to_nhwc",
+             "Experimental: This will convert NCHW operators to NHWC under the assumption that "
+             "input model is NCHW.");
+  add_switch(arser, "--nchw_to_nhwc_input_shape",
+             "Convert the input shape of the model (argument for --convert_nchw_to_nhwc).");
+  add_switch(arser, "--nchw_to_nhwc_output_shape",
+             "Convert the output shape of the model (argument for --convert_nchw_to_nhwc).");
+  add_switch(arser, "--transform_min_max_to_relu6",
+             "Transform Minimum(6)-Maximum(0) pattern to Relu6 operator");
+  add_switch(arser, "--transform_min_relu_to_relu6",
+             "Transform Minimum(6)-Relu pattern to Relu6 operator");
+  add_switch(arser, "--mute_warnings", "This will turn off warning messages");
+  add_switch(arser, "--disable_validation",
+             "This will turn off operator validations. May help input model investigation.");
+  add_switch(arser, "--generate_profile_data", "This will turn on profiling data generation.");
 
   arser.add_argument("--change_outputs")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
     .help("Experimental: Change first subgraph output nodes to CSV names");
 
-  arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model");
-  arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model");
+  arser.add_argument("input").help("Input circle model");
+  arser.add_argument("output").help("Output circle model");
 
   // sparsification argument
-  arser.add_argument("--sparsify_tensor")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
-    .help("Tensor name that you want to sparsify");
+  arser.add_argument("--sparsify_tensor").help("Tensor name that you want to sparsify");
 
   arser.add_argument("--sparsify_traversal_order")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
     .default_value("0,1,2,3")
     .help("Traversal order of dimensions. Default value: 0,1,2,3");
 
   arser.add_argument("--sparsify_format")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
     .default_value("d,s")
     .help("Format of each dimension. 'd' stands for dense, 's' stands for sparse(CSR). Default "
           "value: d,s");
 
-  arser.add_argument("--sparsify_block_size")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
-    .help("Size of each block dimension");
+  arser.add_argument("--sparsify_block_size").help("Size of each block dimension");
 
   arser.add_argument("--sparsify_block_map")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
     .default_value("0,1")
     .help("Map from block dimension to the original tensor dimension. Default value: 0,1");
 
@@ -446,20 +206,12 @@ int entry(int argc, char **argv)
     // If REPLACE is zero, it does not overwrite an existing value.
     setenv("LUCI_LOG", "100", 0);
   }
-  if (arser.get<bool>("--O1"))
-  {
-    options->enable(Algorithms::FuseBCQ);
-    options->enable(Algorithms::FuseInstanceNorm);
-    options->enable(Algorithms::ResolveCustomOpAdd);
-    options->enable(Algorithms::ResolveCustomOpBatchMatMul);
-    options->enable(Algorithms::ResolveCustomOpMatMul);
-    options->enable(Algorithms::RemoveRedundantTranspose);
-    options->enable(Algorithms::SubstitutePackToReshape);
-  }
   if (arser.get<bool>("--fold_add_v2"))
     options->enable(Algorithms::FoldAddV2);
   if (arser.get<bool>("--fold_cast"))
     options->enable(Algorithms::FoldCast);
+  if (arser.get<bool>("--fold_densify"))
+    options->enable(Algorithms::FoldDensify);
   if (arser.get<bool>("--fold_dequantize"))
     options->enable(Algorithms::FoldDequantize);
   if (arser.get<bool>("--fold_dwconv"))
@@ -524,8 +276,12 @@ int entry(int argc, char **argv)
     options->enable(Algorithms::ResolveCustomOpMatMul);
   if (arser.get<bool>("--resolve_customop_max_pool_with_argmax"))
     options->enable(Algorithms::ResolveCustomOpMaxPoolWithArgmax);
+  if (arser.get<bool>("--resolve_customop_splitv"))
+    options->enable(Algorithms::ResolveCustomOpSplitV);
   if (arser.get<bool>("--shuffle_weight_to_16x1float32"))
     options->enable(Algorithms::ShuffleWeightTo16x1Float32);
+  if (arser.get<bool>("--replace_non_const_fc_with_batch_matmul"))
+    options->enable(Algorithms::ReplaceNonConstFCWithBatchMatMul);
   if (arser.get<bool>("--substitute_pack_to_reshape"))
     options->enable(Algorithms::SubstitutePackToReshape);
   if (arser.get<bool>("--substitute_padv2_to_pad"))
@@ -595,37 +351,11 @@ int entry(int argc, char **argv)
     csv_tokenize(csv_nodes, new_outputs);
   }
 
-  // Load model from the file
-  foder::FileLoader file_loader{input_path};
-  std::vector<char> model_data;
-
-  try
-  {
-    model_data = file_loader.load();
-  }
-  catch (const std::runtime_error &err)
-  {
-    std::cerr << err.what() << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  flatbuffers::Verifier verifier{reinterpret_cast<uint8_t *>(model_data.data()), model_data.size()};
-  if (!circle::VerifyModelBuffer(verifier))
-  {
-    std::cerr << "ERROR: Invalid input file '" << input_path << "'" << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  const circle::Model *circle_model = circle::GetModel(model_data.data());
-  if (circle_model == nullptr)
-  {
-    std::cerr << "ERROR: Failed to load circle '" << input_path << "'" << std::endl;
-    return EXIT_FAILURE;
-  }
-
   // Import from input Circle file
-  luci::Importer importer;
-  auto module = importer.importModule(circle_model);
+  luci::ImporterEx importerex;
+  auto module = importerex.importVerifyModule(input_path);
+  if (module.get() == nullptr)
+    return EXIT_FAILURE;
 
   if (change_outputs)
   {
diff --git a/compiler/circlechef/tools/file/Driver.cpp b/compiler/circlechef/tools/file/Driver.cpp
index 76d0f3f7f..9c4256b40 100644
--- a/compiler/circlechef/tools/file/Driver.cpp
+++ b/compiler/circlechef/tools/file/Driver.cpp
@@ -28,10 +28,8 @@
 int entry(int argc, char **argv)
 {
   arser::Arser arser;
-  arser.add_argument("recipe")
-    .type(arser::DataType::STR)
-    .help("Source recipe file path to convert");
-  arser.add_argument("circle").type(arser::DataType::STR).help("Target circle file path");
+  arser.add_argument("recipe").help("Source recipe file path to convert");
+  arser.add_argument("circle").help("Target circle file path");
 
   try
   {
diff --git a/compiler/circlechef/tools/reverse/Driver.cpp b/compiler/circlechef/tools/reverse/Driver.cpp
index 639e0af6f..c8ef07c6f 100644
--- a/compiler/circlechef/tools/reverse/Driver.cpp
+++ b/compiler/circlechef/tools/reverse/Driver.cpp
@@ -25,10 +25,8 @@
 int entry(int argc, char **argv)
 {
   arser::Arser arser;
-  arser.add_argument("circle")
-    .type(arser::DataType::STR)
-    .help("Source circle file path to convert");
-  arser.add_argument("recipe").type(arser::DataType::STR).help("Target recipe file path");
+  arser.add_argument("circle").help("Source circle file path to convert");
+  arser.add_argument("recipe").help("Target recipe file path");
 
   try
   {
diff --git a/compiler/circledump/CMakeLists.txt b/compiler/circledump/CMakeLists.txt
index b65c06677..7485ff8e7 100644
--- a/compiler/circledump/CMakeLists.txt
+++ b/compiler/circledump/CMakeLists.txt
@@ -10,6 +10,7 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
 add_executable(circledump ${DRIVER} ${SOURCES})
 target_include_directories(circledump PRIVATE include)
 target_link_libraries(circledump arser)
+target_link_libraries(circledump foder)
 target_link_libraries(circledump mio_circle04)
 target_link_libraries(circledump mio_circle04_helper)
 target_link_libraries(circledump safemain)
diff --git a/compiler/circledump/driver/Driver.cpp b/compiler/circledump/driver/Driver.cpp
index 657f24fe0..5b0871a91 100644
--- a/compiler/circledump/driver/Driver.cpp
+++ b/compiler/circledump/driver/Driver.cpp
@@ -15,7 +15,7 @@
  */
 
 #include <arser/arser.h>
-#include <circleread/Model.h>
+#include <foder/FileLoader.h>
 #include <circledump/Dump.h>
 
 #include <iostream>
@@ -23,7 +23,7 @@
 int entry(int argc, char **argv)
 {
   arser::Arser arser;
-  arser.add_argument("circle").type(arser::DataType::STR).help("Circle file path to dump");
+  arser.add_argument("circle").help("Circle file path to dump");
 
   try
   {
@@ -38,14 +38,10 @@ int entry(int argc, char **argv)
 
   std::string circle_path = arser.get<std::string>("circle");
   // Load Circle model from a circle file
-  std::unique_ptr<circleread::Model> model = circleread::load_circle(circle_path);
-  if (model == nullptr)
-  {
-    std::cerr << "ERROR: Failed to load circle '" << circle_path << "'" << std::endl;
-    return 255;
-  }
-
-  const circle::Model *circlemodel = model->model();
+  foder::FileLoader fileLoader{circle_path};
+  std::vector<char> modelData = fileLoader.load();
+  const circle::Model *circlemodel = circle::GetModel(modelData.data());
+  // NOTE circlemodel presumably points into modelData, so modelData must outlive it
   if (circlemodel == nullptr)
   {
     std::cerr << "ERROR: Failed to load circle '" << circle_path << "'" << std::endl;
diff --git a/compiler/circledump/requires.cmake b/compiler/circledump/requires.cmake
index 362d67cf4..183dfe227 100644
--- a/compiler/circledump/requires.cmake
+++ b/compiler/circledump/requires.cmake
@@ -1,3 +1,4 @@
 require("arser")
+require("foder")
 require("mio-circle04")
 require("safemain")
diff --git a/compiler/circledump/src/Dump.cpp b/compiler/circledump/src/Dump.cpp
index 0b256dda8..69427a20e 100644
--- a/compiler/circledump/src/Dump.cpp
+++ b/compiler/circledump/src/Dump.cpp
@@ -16,8 +16,8 @@
 
 #include <circledump/Dump.h>
 #include <mio_circle/Helper.h>
+#include <mio_circle/Reader.h>
 
-#include "Read.h"
 #include "OpPrinter.h"
 #include "MetadataPrinter.h"
 
@@ -122,7 +122,7 @@ std::ostream &operator<<(std::ostream &os, const flatbuffers::Vector<T> *fbvect)
   return os;
 }
 
-void dump_sub_graph(std::ostream &os, circleread::Reader &reader)
+void dump_sub_graph(std::ostream &os, mio::circle::Reader &reader)
 {
   auto tensors = reader.tensors();
   auto operators = reader.operators();
@@ -150,14 +150,14 @@ void dump_sub_graph(std::ostream &os, circleread::Reader &reader)
     std::vector<int32_t> dims = {-1};
 
     if (tensor->shape())
-      dims = circleread::as_index_vector(tensor->shape());
+      dims = mio::circle::as_index_vector(tensor->shape());
 
     os << "T(" << reader.subgraph_index() << ":" << i << ") " << mio::circle::tensor_type(tensor)
        << " ";
     os << "(" << dims << ") ";
     if (tensor->shape_signature())
     {
-      std::vector<int32_t> dims_sig = circleread::as_index_vector(tensor->shape_signature());
+      std::vector<int32_t> dims_sig = mio::circle::as_index_vector(tensor->shape_signature());
       os << "(" << dims_sig << ") ";
     }
     os << "B(" << tensor->buffer() << ") ";
@@ -299,8 +299,8 @@ void dump_sub_graph(std::ostream &os, circleread::Reader &reader)
     const auto op = operators->Get(i);
     circle::BuiltinOperator builtincode = reader.builtin_code(op);
 
-    const std::vector<int32_t> &inputs = circleread::as_index_vector(op->inputs());
-    const std::vector<int32_t> &outputs = circleread::as_index_vector(op->outputs());
+    const std::vector<int32_t> &inputs = mio::circle::as_index_vector(op->inputs());
+    const std::vector<int32_t> &outputs = mio::circle::as_index_vector(op->outputs());
     auto op_name = reader.opcode_name(op);
 
     os << "O(" << reader.subgraph_index() << ":" << i << ") " << op_name << " ";
@@ -356,7 +356,7 @@ void dump_sub_graph(std::ostream &os, circleread::Reader &reader)
 
 void dump_model(std::ostream &os, const circle::Model *model)
 {
-  circleread::Reader reader(model);
+  mio::circle::Reader reader(model);
 
   uint32_t num_subgraph = reader.num_subgraph();
 
diff --git a/compiler/circledump/src/Load.cpp b/compiler/circledump/src/Load.cpp
deleted file mode 100644
index 67e7fa5a6..000000000
--- a/compiler/circledump/src/Load.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <circleread/Model.h>
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-
-namespace
-{
-
-class MemoryMappedModel final : public circleread::Model
-{
-public:
-  /**
-   * @require fd and data SHOULD be valid
-   */
-  explicit MemoryMappedModel(int fd, void *data, size_t size) : _fd{fd}, _data{data}, _size{size}
-  {
-    // DO NOTHING
-  }
-
-public:
-  ~MemoryMappedModel()
-  {
-    munmap(_data, _size);
-    close(_fd);
-  }
-
-public:
-  MemoryMappedModel(const MemoryMappedModel &) = delete;
-  MemoryMappedModel(MemoryMappedModel &&) = delete;
-
-public:
-  const ::circle::Model *model(void) const override { return ::circle::GetModel(_data); }
-
-private:
-  int _fd = -1;
-  void *_data = nullptr;
-  size_t _size = 0;
-};
-
-class FileDescriptor final
-{
-public:
-  FileDescriptor(int value) : _value{value}
-  {
-    // DO NOTHING
-  }
-
-public:
-  // NOTE Copy is not allowed
-  FileDescriptor(const FileDescriptor &) = delete;
-
-public:
-  // NOTE Move is allowed
-  FileDescriptor(FileDescriptor &&fd) { _value = fd.release(); }
-
-public:
-  ~FileDescriptor()
-  {
-    if (_value != -1)
-    {
-      // Close on destructor
-      close(_value);
-    }
-  }
-
-public:
-  int value(void) const { return _value; }
-
-public:
-  int release(void)
-  {
-    auto res = _value;
-    _value = -1;
-    return res;
-  }
-
-private:
-  int _value = -1;
-};
-
-} // namespace
-
-namespace circleread
-{
-
-std::unique_ptr<Model> load_circle(const std::string &path)
-{
-  FileDescriptor fd = open(path.c_str(), O_RDONLY);
-
-  if (fd.value() == -1)
-  {
-    // Return nullptr on open failure
-    return nullptr;
-  }
-
-  struct stat st;
-  if (fstat(fd.value(), &st) == -1)
-  {
-    // Return nullptr on fstat failure
-    return nullptr;
-  }
-
-  auto size = st.st_size;
-  auto data = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd.value(), 0);
-
-  if (data == MAP_FAILED)
-  {
-    // Return nullptr on mmap failure
-    return nullptr;
-  }
-
-  return std::unique_ptr<circleread::Model>{new MemoryMappedModel(fd.release(), data, size)};
-}
-
-} // namespace circleread
diff --git a/compiler/circledump/src/OpPrinter.cpp b/compiler/circledump/src/OpPrinter.cpp
index 02e5c26b5..817371dcf 100644
--- a/compiler/circledump/src/OpPrinter.cpp
+++ b/compiler/circledump/src/OpPrinter.cpp
@@ -15,7 +15,8 @@
  */
 
 #include "OpPrinter.h"
-#include "Read.h"
+
+#include <mio_circle/Helper.h>
 
 #include <memory>
 
@@ -233,7 +234,7 @@ public:
   {
     if (auto *reshape_params = op->builtin_options_as_ReshapeOptions())
     {
-      auto new_shape = circleread::as_index_vector(reshape_params->new_shape());
+      auto new_shape = mio::circle::as_index_vector(reshape_params->new_shape());
       os << "    ";
       os << "NewShape(" << new_shape << ")";
       os << std::endl;
@@ -802,6 +803,7 @@ OpPrinterRegistry::OpPrinterRegistry()
   // There is no Option for CEIL
   _op_map[circle::BuiltinOperator_CONCATENATION] = make_unique<ConcatenationPrinter>();
   _op_map[circle::BuiltinOperator_CONV_2D] = make_unique<Conv2DPrinter>();
+  // There is no Option for DENSIFY
   _op_map[circle::BuiltinOperator_DEPTH_TO_SPACE] = make_unique<DepthToSpacePrinter>();
   _op_map[circle::BuiltinOperator_DEPTHWISE_CONV_2D] = make_unique<DepthwiseConv2DPrinter>();
   // There is no Option for DEQUANTIZE
diff --git a/compiler/circledump/src/Read.cpp b/compiler/circledump/src/Read.cpp
deleted file mode 100644
index 3a7e98cde..000000000
--- a/compiler/circledump/src/Read.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Read.h"
-
-#include <mio_circle/Helper.h>
-
-#include <sstream>
-#include <string>
-
-namespace circleread
-{
-
-Reader::Reader(const circle::Model *model)
-{
-  _version = model->version();
-  _subgraphs = model->subgraphs();
-  _buffers = model->buffers();
-  _metadata = model->metadata();
-  _signature_defs = model->signature_defs();
-
-  auto opcodes = model->operator_codes();
-  for (const ::circle::OperatorCode *opcode : *opcodes)
-  {
-    _op_codes.push_back(opcode);
-  }
-}
-
-size_t Reader::buffer_info(uint32_t buf_idx, const uint8_t **buff_data)
-{
-  *buff_data = nullptr;
-
-  if (buf_idx == 0)
-    return 0;
-
-  if (auto *buffer = (*_buffers)[buf_idx])
-  {
-    if (auto *array = buffer->data())
-    {
-      if (size_t size = array->size())
-      {
-        *buff_data = reinterpret_cast<const uint8_t *>(array->data());
-        return size;
-      }
-    }
-  }
-
-  return 0;
-}
-
-circle::BuiltinOperator Reader::builtin_code(const circle::Operator *op) const
-{
-  uint32_t index = op->opcode_index();
-  assert(index < _op_codes.size());
-  const circle::OperatorCode *opcode = _op_codes.at(index);
-
-  return opcode->builtin_code();
-}
-
-std::string Reader::opcode_name(const circle::Operator *op) const
-{
-  uint32_t index = op->opcode_index();
-  assert(index < _op_codes.size());
-  const circle::OperatorCode *opcode = _op_codes.at(index);
-
-  if (!mio::circle::is_valid(opcode))
-  {
-    std::ostringstream oss;
-    oss << "(invalid: " << index << ")";
-    return oss.str();
-  }
-
-  return mio::circle::opcode_name(opcode);
-}
-
-bool Reader::select_subgraph(uint32_t sgindex)
-{
-  _subgraph_index = sgindex;
-  _tensors = nullptr;
-  _operators = nullptr;
-
-  _inputs.clear();
-  _outputs.clear();
-
-  if (_subgraphs->Length() <= sgindex)
-  {
-    assert(false);
-    return false;
-  }
-
-  const circle::SubGraph *subgraph = (*_subgraphs)[sgindex];
-
-  auto name = subgraph->name();
-  _subgraph_name = name ? name->c_str() : "(noname)";
-
-  _tensors = subgraph->tensors();
-  _operators = subgraph->operators();
-  _data_format = subgraph->data_format();
-
-  _inputs = as_index_vector(subgraph->inputs());
-  _outputs = as_index_vector(subgraph->outputs());
-
-  return true;
-}
-
-} // namespace circleread
diff --git a/compiler/cli/CMakeLists.txt b/compiler/cli/CMakeLists.txt
index 0fb99ddba..4ab0ea218 100644
--- a/compiler/cli/CMakeLists.txt
+++ b/compiler/cli/CMakeLists.txt
@@ -10,5 +10,5 @@ endif(NOT ENABLE_TEST)
 
 nnas_find_package(GTest QUIET)
 
-GTest_AddTEst(cli_test ${TESTS})
+GTest_AddTest(cli_test ${TESTS})
 target_link_libraries(cli_test cli)
diff --git a/compiler/coco/core/src/IR/Module.cpp b/compiler/coco/core/src/IR/Module.cpp
index 420cf6f0c..0db78941c 100644
--- a/compiler/coco/core/src/IR/Module.cpp
+++ b/compiler/coco/core/src/IR/Module.cpp
@@ -144,7 +144,7 @@ std::unique_ptr<Module> Module::create(void)
   m->_input = make_unique<coco::InputList>();
   m->_output = make_unique<coco::OutputList>();
 
-  return std::move(m);
+  return m;
 }
 
 } // namespace coco
diff --git a/compiler/coco/generic/src/IR/Data.cpp b/compiler/coco/generic/src/IR/Data.cpp
index 5ab7069ee..361dcc243 100644
--- a/compiler/coco/generic/src/IR/Data.cpp
+++ b/compiler/coco/generic/src/IR/Data.cpp
@@ -209,8 +209,7 @@ std::unique_ptr<Data> Data::create(void)
   data->_blob = std::move(blob);
   data->_fp32 = std::move(fp32);
 
-  // GCC 4.9 tries to copy data (while GCC 6.X doesn't)
-  return std::move(data);
+  return data;
 }
 
 } // namespace coco
diff --git a/compiler/common-artifacts/CMakeLists.txt b/compiler/common-artifacts/CMakeLists.txt
index 404149c15..34a3a4d7d 100644
--- a/compiler/common-artifacts/CMakeLists.txt
+++ b/compiler/common-artifacts/CMakeLists.txt
@@ -12,14 +12,6 @@ if(${PYTHON_VERSION_MINOR} LESS 8)
   return()
 endif()
 
-# Create python virtual environment with tensorflow 2.6.0
-set(VIRTUALENV_OVERLAY_TF_2_6_0 "${NNCC_OVERLAY_DIR}/venv_2_6_0")
-
-add_custom_command(
-  OUTPUT ${VIRTUALENV_OVERLAY_TF_2_6_0}
-  COMMAND ${PYTHON_EXECUTABLE} -m venv ${VIRTUALENV_OVERLAY_TF_2_6_0}
-)
-
 # Create python virtual environment with tensorflow 2.8.0
 set(VIRTUALENV_OVERLAY_TF_2_8_0 "${NNCC_OVERLAY_DIR}/venv_2_8_0")
 
@@ -30,33 +22,36 @@ add_custom_command(
 
 # Create requirements.txt and install required pip packages
 set(REQUIREMENTS_FILE "requirements.txt")
-set(REQUIREMENTS_OVERLAY_PATH_TF_2_6_0 "${VIRTUALENV_OVERLAY_TF_2_6_0}/${REQUIREMENTS_FILE}")
 set(REQUIREMENTS_OVERLAY_PATH_TF_2_8_0 "${VIRTUALENV_OVERLAY_TF_2_8_0}/${REQUIREMENTS_FILE}")
 
-add_custom_command(
-  OUTPUT ${REQUIREMENTS_OVERLAY_PATH_TF_2_6_0}
-  COMMAND ${CMAKE_COMMAND} -E remove -f ${REQUIREMENTS_OVERLAY_PATH_TF_2_6_0}
-  COMMAND ${CMAKE_COMMAND} -E echo "tensorflow-cpu==2.6.0" >> ${REQUIREMENTS_OVERLAY_PATH_TF_2_6_0}
-  COMMAND ${CMAKE_COMMAND} -E echo "flatbuffers==1.12" >> ${REQUIREMENTS_OVERLAY_PATH_TF_2_6_0}
-  COMMAND ${VIRTUALENV_OVERLAY_TF_2_6_0}/bin/python3.8 -m pip --default-timeout=1000 install --upgrade pip setuptools
-  COMMAND ${VIRTUALENV_OVERLAY_TF_2_6_0}/bin/python3.8 -m pip --default-timeout=1000 install -r ${REQUIREMENTS_OVERLAY_PATH_TF_2_6_0} --upgrade
-  DEPENDS ${VIRTUALENV_OVERLAY_TF_2_6_0}
-)
+set(PYTHON_OVERLAY python3)
+if(PYTHON_EXECUTABLE MATCHES python3.8)
+  set(PYTHON_OVERLAY python3.8)
+endif()
 
+# NOTE when running behind a proxy with a self-signed certificate, '--trusted-host' options need to be set
+set(PIP_OPTION_TRUSTED_HOST )
+if(DEFINED ENV{ONE_PIP_OPTION_TRUST_HOST})
+  set(PIP_OPTION_TRUSTED_HOST --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --trusted-host pypi.org)
+endif()
+
+# NOTE refer https://github.com/protocolbuffers/protobuf/issues/10051
+# TODO remove protobuf==3.20.1 when issue is resolved
 add_custom_command(
   OUTPUT ${REQUIREMENTS_OVERLAY_PATH_TF_2_8_0}
   COMMAND ${CMAKE_COMMAND} -E remove -f ${REQUIREMENTS_OVERLAY_PATH_TF_2_8_0}
   COMMAND ${CMAKE_COMMAND} -E echo "tensorflow-cpu==2.8.0" >> ${REQUIREMENTS_OVERLAY_PATH_TF_2_8_0}
   COMMAND ${CMAKE_COMMAND} -E echo "flatbuffers==1.12" >> ${REQUIREMENTS_OVERLAY_PATH_TF_2_8_0}
-  COMMAND ${VIRTUALENV_OVERLAY_TF_2_8_0}/bin/python3.8 -m pip --default-timeout=1000 install --upgrade pip setuptools
-  COMMAND ${VIRTUALENV_OVERLAY_TF_2_8_0}/bin/python3.8 -m pip --default-timeout=1000 install -r ${REQUIREMENTS_OVERLAY_PATH_TF_2_8_0} --upgrade
+  COMMAND ${CMAKE_COMMAND} -E echo "protobuf==3.20.1" >> ${REQUIREMENTS_OVERLAY_PATH_TF_2_8_0}
+  COMMAND ${VIRTUALENV_OVERLAY_TF_2_8_0}/bin/${PYTHON_OVERLAY} -m pip --default-timeout=1000
+          ${PIP_OPTION_TRUSTED_HOST} install --upgrade pip setuptools
+  COMMAND ${VIRTUALENV_OVERLAY_TF_2_8_0}/bin/${PYTHON_OVERLAY} -m pip --default-timeout=1000
+          ${PIP_OPTION_TRUSTED_HOST} install -r ${REQUIREMENTS_OVERLAY_PATH_TF_2_8_0} --upgrade
   DEPENDS ${VIRTUALENV_OVERLAY_TF_2_8_0}
 )
 
 add_custom_target(common_artifacts_python_deps ALL
-  DEPENDS ${VIRTUALENV_OVERLAY_TF_2_6_0}
-          ${VIRTUALENV_OVERLAY_TF_2_8_0}
-          ${REQUIREMENTS_OVERLAY_PATH_TF_2_6_0}
+  DEPENDS ${VIRTUALENV_OVERLAY_TF_2_8_0}
           ${REQUIREMENTS_OVERLAY_PATH_TF_2_8_0}
 )
 
@@ -246,7 +241,13 @@ foreach(RECIPE IN ITEMS ${RECIPES})
   if(NOT DEFINED NO_OPTIMIZE_${RECIPE})
     # Generate optimized .circle
     add_custom_command(OUTPUT ${OPT_CIRCLE_OUTPUT_PATH}
-      COMMAND $<TARGET_FILE:circle2circle> --O1 ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH}
+      # NOTE --resolve_customop_add is added just to stand in for the old --O1; no particular meaning
+      #      --fold_dequantize is added to fold Tensor(FLOAT16) + DEQUANTIZE (Net_Dequantize_Add)
+      #      model. FLOAT16 in general is NOT supported but only Tensor(FLOAT16) + DEQUANTIZE
+      #      sequence accepted as folded to Tensor(FLOAT32).
+      # TODO revise giving options from the list file
+      COMMAND $<TARGET_FILE:circle2circle> --resolve_customop_add --fold_dequantize --fold_densify
+              ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH}
       DEPENDS $<TARGET_FILE:circle2circle>  ${CIRCLE_OUTPUT_PATH}
       COMMENT "Generate ${OPT_CIRCLE_FILE}"
     )
diff --git a/compiler/common-artifacts/exclude.lst b/compiler/common-artifacts/exclude.lst
index 92b07fde8..2275a42d9 100644
--- a/compiler/common-artifacts/exclude.lst
+++ b/compiler/common-artifacts/exclude.lst
@@ -32,6 +32,7 @@ tcgenerate(BroadcastTo_000) # luci-interpreter doesn't support custom operator
 tcgenerate(Ceil_000)
 tcgenerate(Conv2D_003) # runtime doesn't support dilation
 tcgenerate(Cos_000)
+tcgenerate(Densify_000) # luci-interpreter doesn't support Densify yet
 tcgenerate(DepthwiseConv2D_001) # runtime doesn't support dilation
 tcgenerate(DepthwiseConv2D_003) # runtime doesn't support dilation
 tcgenerate(DepthwiseConv2D_U8_001)  # luci-interpreter doesn't support channel-wise quantization yet
@@ -67,6 +68,8 @@ tcgenerate(Neg_000)
 tcgenerate(Net_BroadcastTo_AddV2_001) # luci-interpreter doesn't support custom operator
 tcgenerate(Net_Conv_FakeQuant_000) # luci-interpreter doesn't support FakeQuant yet
 tcgenerate(Net_Dangle_001)
+tcgenerate(Net_Densify_Add_000) # luci-interpreter doesn't support Densify yet
+tcgenerate(Net_Densify_Dequantize_Add_000) # luci-interpreter doesn't support Densify/Dequantize yet
 tcgenerate(Net_Gather_SparseToDense_AddV2_000) # luci-interpreter doesn't support custom operator
 tcgenerate(Net_ZeroDim_001) # luci-interpreter doesn't support zero dim
 tcgenerate(OneHot_000)
diff --git a/compiler/common-artifacts/src/TestDataGenerator.cpp b/compiler/common-artifacts/src/TestDataGenerator.cpp
index 33cecbbe2..7481050c5 100644
--- a/compiler/common-artifacts/src/TestDataGenerator.cpp
+++ b/compiler/common-artifacts/src/TestDataGenerator.cpp
@@ -142,23 +142,15 @@ void fill_random_range(void *data, uint32_t size, loco::DataType dtype, int32_t
 int entry(int argc, char **argv)
 {
   arser::Arser arser;
-  arser.add_argument("circle").type(arser::DataType::STR).help("Circle file you want to test");
-  arser.add_argument("--input_data")
-    .required(true)
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .help("Path to generate input data h5 file");
+  arser.add_argument("circle").help("Circle file you want to test");
+  arser.add_argument("--input_data").required(true).help("Path to generate input data h5 file");
   arser.add_argument("--expected_data")
     .required(true)
-    .nargs(1)
-    .type(arser::DataType::STR)
     .help("Path to generate expected data h5 file");
   arser.add_argument("--fixed_seed")
-    .required(false)
     .nargs(0)
     .help("Put a fixed seed into the random number generator");
   arser.add_argument("--input_range")
-    .required(false)
     .nargs(3)
     .type(arser::DataType::STR_VEC)
     .help("Set random number range [min max] for the input as 'name min max'");
diff --git a/compiler/crew/CMakeLists.txt b/compiler/crew/CMakeLists.txt
index 1824d86ab..45cda7562 100644
--- a/compiler/crew/CMakeLists.txt
+++ b/compiler/crew/CMakeLists.txt
@@ -12,9 +12,12 @@ if(NOT ENABLE_TEST)
   return()
 endif(NOT ENABLE_TEST)
 
+configure_file("src/test_read_semicolon.ini" "test_read_semicolon.ini" COPYONLY)
+
 nnas_find_package(GTest REQUIRED)
 
 GTest_AddTest(crew_test ${TESTS})
 target_include_directories(crew_test PRIVATE src)
 target_link_libraries(crew_test nncc_common)
 target_link_libraries(crew_test crew)
+target_link_libraries(crew_test foder)
diff --git a/compiler/crew/src/PConfigIni.cpp b/compiler/crew/src/PConfigIni.cpp
index f0e3e8e01..5177843bf 100644
--- a/compiler/crew/src/PConfigIni.cpp
+++ b/compiler/crew/src/PConfigIni.cpp
@@ -26,10 +26,36 @@
 #include <sstream>
 #include <stdexcept>
 #include <string>
+#include <vector>
 
 namespace crew
 {
 
+namespace
+{
+
+// Returns a copy of 'source' where each escaped "\;" becomes a plain ';'
+std::string filter_escape(const std::string &source)
+{
+  std::string key = source;
+
+  // if key is surrounded with quotation
+  // TODO for quotation
+
+  // if key has '\\' + ';', remove '\\'
+  // NOTE resume after the kept ';' so that already unescaped text is not
+  //      scanned again (otherwise a literal backslash before "\;" is dropped too)
+  size_t pos = 0;
+  while ((pos = key.find("\\;", pos)) != std::string::npos)
+  {
+    key.erase(pos, 1); // drop the backslash, ';' is now at 'pos'
+    pos += 1;
+  }
+  return key;
+}
+
+} // namespace
+
 Sections read_ini(const char *data, size_t length)
 {
   assert(data != nullptr);
@@ -84,6 +110,7 @@ Sections read_ini(const char *data, size_t length)
       {
         auto key = string_line.substr(0, pos);
         auto val = string_line.substr(pos + 1);
+        key = filter_escape(key);
         section.items.emplace(key, val);
       }
     }
@@ -107,11 +134,53 @@ Sections read_ini(const std::string &path)
   return read_ini(ini_data.data(), ini_data.size());
 }
 
+namespace
+{
+
+void replace(std::string &source, const std::string &token, const std::string &replace)
+{
+  // Replace every 'token' in 'source' in place; an empty 'token' matches nothing
+  // NOTE stepping over the inserted text handles 'token' being a substring of 'replace'
+  for (size_t pos = token.empty() ? std::string::npos : 0;
+       (pos = source.find(token, pos)) != std::string::npos; pos += replace.length())
+    source.replace(pos, token.length(), replace);
+
+}
+
+Sections insert_escape(const Sections &inputs)
+{
+  Sections sections;
+  // Build and return a copy of 'inputs' whose item keys have ';' escaped
+  // for every section in 'inputs', copy the name and all items;
+  // each ';' in a key is replaced with '\;' while values are copied as-is
+  for (auto &input : inputs)
+  {
+    Section section;
+    section.name = input.name;
+
+    for (auto &item : input.items)
+    {
+      auto key = item.first;
+      auto value = item.second;
+      // 'key' is a local copy, so the item of the input section is not modified
+      replace(key, ";", "\\;");
+      section.items[key] = value;
+    }
+    sections.push_back(section);
+  }
+
+  return sections;
+}
+
+} // namespace
+
 void write_ini(std::ostream &os, const Sections &sections)
 {
   std::stringstream ss;
 
-  ss << sections;
+  auto processed = insert_escape(sections);
+
+  ss << processed;
 
   std::string strss = ss.str();
 
diff --git a/compiler/crew/src/PConfigIni.test.cpp b/compiler/crew/src/PConfigIni.test.cpp
index bdd2ccc1f..c062c6937 100644
--- a/compiler/crew/src/PConfigIni.test.cpp
+++ b/compiler/crew/src/PConfigIni.test.cpp
@@ -17,12 +17,14 @@
 #include "crew/PConfigIni.h"
 #include "crew/PConfigIniDump.h"
 
+#include <foder/FileLoader.h>
+
 #include <gtest/gtest.h>
 
 #include <sstream>
 #include <stdexcept>
 
-TEST(ConfigIniTest, read_ini_non_exist_file)
+TEST(ConfigIniTest, read_ini_non_exist_file_NEG)
 {
   EXPECT_THROW(crew::read_ini("/hello/world/not_a_file"), std::runtime_error);
 }
@@ -85,3 +87,60 @@ TEST(ConfigIniTest, write_ini_file_error_NEG)
   crew::Sections sections;
   EXPECT_THROW(crew::write_ini("/abc/def/cannot_access", sections), std::runtime_error);
 }
+
+TEST(ConfigIniTest, read_file_escape_semicolon)
+{
+  auto sections = crew::read_ini("test_read_semicolon.ini");
+  ASSERT_EQ(1UL, sections.size());
+  // test_read_semicolon.ini has a single [hello] section holding a single item
+  auto its = sections.begin();
+  ASSERT_NE(sections.end(), its);
+  EXPECT_TRUE("hello" == its->name);
+  ASSERT_EQ(1UL, its->items.size());
+
+  auto it = its->items.begin();
+  ASSERT_NE(its->items.end(), it);
+  // the key is stored as "keya\;keyb\;keyc\;keyd" in the file; no '\' may remain after read
+  EXPECT_TRUE("keya;keyb;keyc;keyd" == it->first);
+  EXPECT_TRUE("world" == it->second);
+}
+
+TEST(ConfigIniTest, write_file_escape_semicolon)
+{
+  std::string path("test_write_semicolon.ini");
+
+  // save key with ';'
+  {
+    crew::Sections sections;
+    crew::Section hello;
+    hello.name = "hello";
+    hello.items["keya;keyb;keyc;keyd"] = "world";
+    sections.push_back(hello);
+    crew::write_ini(path, sections);
+  }
+
+  // load the file and check if there is '\\'
+  std::string strbuffer;
+  {
+    foder::FileLoader file_loader{path};
+    auto ini_data = file_loader.load();
+
+    // NOTE the buffer must be sized (not only reserved) before writing through data()
+    auto length = ini_data.size();
+    auto buffer = std::vector<char>(length + 1, '\0');
+
+    char *pbuffer = buffer.data();
+    memcpy(pbuffer, ini_data.data(), length);
+    *(pbuffer + length) = 0;
+
+    strbuffer = pbuffer;
+  }
+  int32_t count = 0;
+  size_t pos = 0;
+  while ((pos = strbuffer.find("\\;", pos)) != std::string::npos)
+  {
+    count++;
+    pos++;
+  }
+  EXPECT_TRUE(count == 3);
+}
diff --git a/compiler/crew/src/test_read_semicolon.ini b/compiler/crew/src/test_read_semicolon.ini
new file mode 100644
index 000000000..d966fb707
--- /dev/null
+++ b/compiler/crew/src/test_read_semicolon.ini
@@ -0,0 +1,2 @@
+[hello]
+keya\;keyb\;keyc\;keyd=world
diff --git a/compiler/enco/core/src/CppGen/Host.cpp b/compiler/enco/core/src/CppGen/Host.cpp
index 7f9456239..63baf0b31 100644
--- a/compiler/enco/core/src/CppGen/Host.cpp
+++ b/compiler/enco/core/src/CppGen/Host.cpp
@@ -299,7 +299,7 @@ std::unique_ptr<pp::MultiLineText> HostBlockCompiler::compile(const coco::Block
     res->append(ins->accept(prn));
   }
 
-  return std::move(res);
+  return res;
 }
 
 } // namespace enco
diff --git a/compiler/enco/core/src/CppGen/Subnet.cpp b/compiler/enco/core/src/CppGen/Subnet.cpp
index 599b0794e..3fc14edf5 100644
--- a/compiler/enco/core/src/CppGen/Subnet.cpp
+++ b/compiler/enco/core/src/CppGen/Subnet.cpp
@@ -373,7 +373,7 @@ std::unique_ptr<SubnetStruct> SubnetStructBuilder::build(const ANNBinder *binder
   // Finalize compilation
   res->ctor()->append("ANeuralNetworksCompilation_finish(", cname, ");");
 
-  return std::move(res);
+  return res;
 }
 
 std::unique_ptr<pp::MultiLineText> SubnetBlockCompiler::compile(const ANNBinder *binder) const
@@ -415,7 +415,7 @@ std::unique_ptr<pp::MultiLineText> SubnetBlockCompiler::compile(const ANNBinder
 
   res->append("ANeuralNetworksExecution_free(execution);");
 
-  return std::move(res);
+  return res;
 }
 
 } // namespace enco
diff --git a/compiler/enco/core/src/Transforms/Split.cpp b/compiler/enco/core/src/Transforms/Split.cpp
index 714c27a72..4bb21b0a7 100644
--- a/compiler/enco/core/src/Transforms/Split.cpp
+++ b/compiler/enco/core/src/Transforms/Split.cpp
@@ -656,7 +656,7 @@ public:
           app->ofm(ofm);
           app->ker(ker);
 
-          return std::move(app);
+          return app;
         }
         else
         {
@@ -676,7 +676,7 @@ public:
           app->ofm(ofm);
           app->ker(ker);
 
-          return std::move(app);
+          return app;
         }
       }
     }
@@ -704,7 +704,7 @@ public:
         app->right(right);
         app->out(out);
 
-        return std::move(app);
+        return app;
       }
     }
     else if (auto op = eval->op()->asMul())
@@ -731,7 +731,7 @@ public:
         app->right(right);
         app->out(out);
 
-        return std::move(app);
+        return app;
       }
     }
     else if (auto op = eval->op()->asPadF())
@@ -754,7 +754,7 @@ public:
         app->ifm(ifm);
         app->ofm(ofm);
 
-        return std::move(app);
+        return app;
       }
     }
     else if (auto maxpool = eval->op()->asMaxPool2D())
@@ -779,7 +779,7 @@ public:
         app->ifm(ifm);
         app->ofm(ofm);
 
-        return std::move(app);
+        return app;
       }
     }
     else if (auto avgpool = eval->op()->asAvgPool2D())
@@ -808,7 +808,7 @@ public:
           app->ifm(ifm);
           app->ofm(ofm);
 
-          return std::move(app);
+          return app;
         }
       }
     }
@@ -831,7 +831,7 @@ public:
         app->ifm(ifm);
         app->ofm(ofm);
 
-        return std::move(app);
+        return app;
       }
     }
     else if (auto relu6 = eval->op()->asReLU6())
@@ -853,7 +853,7 @@ public:
         app->ifm(ifm);
         app->ofm(ofm);
 
-        return std::move(app);
+        return app;
       }
     }
     else if (auto op = eval->op()->asConcatF())
@@ -880,7 +880,7 @@ public:
         app->right(right);
         app->out(out);
 
-        return std::move(app);
+        return app;
       }
     }
     else if (auto op = eval->op()->asSub())
@@ -907,7 +907,7 @@ public:
         app->right(right);
         app->out(out);
 
-        return std::move(app);
+        return app;
       }
     }
     else if (auto op = eval->op()->asDiv())
@@ -934,7 +934,7 @@ public:
         app->right(right);
         app->out(out);
 
-        return std::move(app);
+        return app;
       }
     }
 
@@ -967,7 +967,7 @@ std::unique_ptr<ANNOpAppender> make_appender(coco::Instr *ins)
     app->left(depth_concat->fst()->asFeature());
     app->right(depth_concat->snd()->asFeature());
 
-    return std::move(app);
+    return app;
   }
 
   // Build ANN IR from ANNConv2D instruction
@@ -986,7 +986,7 @@ std::unique_ptr<ANNOpAppender> make_appender(coco::Instr *ins)
     app->ker(conv2d->ker()->asKernel());
     app->bias(coco::safe_cast<coco::FeatureObject>(conv2d->bias()));
 
-    return std::move(app);
+    return app;
   }
 
   return nullptr;
diff --git a/compiler/exo/src/Conversion/DepthwiseConv2DConverter.cpp b/compiler/exo/src/Conversion/DepthwiseConv2DConverter.cpp
index aa2cad705..32ad44385 100644
--- a/compiler/exo/src/Conversion/DepthwiseConv2DConverter.cpp
+++ b/compiler/exo/src/Conversion/DepthwiseConv2DConverter.cpp
@@ -25,6 +25,8 @@
 #include <loco/Service/TypeInference.h>
 #include <loco/Service/ShapeInference.h>
 
+#include <limits>
+
 namespace exo
 {
 
diff --git a/compiler/kuma/src/IntervalSet.h b/compiler/kuma/src/IntervalSet.h
index 3b6c5f666..1e26581c0 100644
--- a/compiler/kuma/src/IntervalSet.h
+++ b/compiler/kuma/src/IntervalSet.h
@@ -17,6 +17,7 @@
 #ifndef __KUMA_DETAILS_LIVE_INTERVAL_SET_H__
 #define __KUMA_DETAILS_LIVE_INTERVAL_SET_H__
 
+#include <cstdint>
 #include <map>
 
 namespace kuma
diff --git a/compiler/loco/include/loco/IR/DataTypeTraits.h b/compiler/loco/include/loco/IR/DataTypeTraits.h
index 1f78c9fec..6be46c3b3 100644
--- a/compiler/loco/include/loco/IR/DataTypeTraits.h
+++ b/compiler/loco/include/loco/IR/DataTypeTraits.h
@@ -83,6 +83,13 @@ template <> struct DataTypeImpl<DataType::U64>
   using Type = uint64_t;
 };
 
+template <> struct DataTypeImpl<DataType::FLOAT16>
+{
+  // float16 is held as its 16-bit value in uint16_t, encoded with help of FP16 library
+  // https://github.com/Maratyszcza/FP16/
+  using Type = uint16_t;
+};
+
 template <> struct DataTypeImpl<DataType::FLOAT32>
 {
   // Use C++ float type for IEEE 32-bit floating-point numbers
@@ -132,6 +139,8 @@ inline uint32_t size(DataType data_type)
       return sizeof(DataTypeImpl<DataType::S64>::Type);
     case DataType::U64:
       return sizeof(DataTypeImpl<DataType::U64>::Type);
+    case DataType::FLOAT16:
+      return sizeof(DataTypeImpl<DataType::FLOAT16>::Type);
     case DataType::FLOAT32:
       return sizeof(DataTypeImpl<DataType::FLOAT32>::Type);
     case DataType::FLOAT64:
diff --git a/compiler/logo/src/Passes/SimplifyDomainConversionPass.cpp b/compiler/logo/src/Passes/SimplifyDomainConversionPass.cpp
index 500f08623..40ddb133b 100644
--- a/compiler/logo/src/Passes/SimplifyDomainConversionPass.cpp
+++ b/compiler/logo/src/Passes/SimplifyDomainConversionPass.cpp
@@ -122,9 +122,6 @@ bool SimplifyDomainConversionPass::run(loco::Graph *g)
     {
       using namespace loco;
 
-      auto encoder = encode_node->encoder();
-      assert(encoder != nullptr);
-
       auto decode_node = dynamic_cast<loco::FeatureDecode *>(encode_node->input());
       if (decode_node == nullptr)
       {
@@ -132,6 +129,9 @@ bool SimplifyDomainConversionPass::run(loco::Graph *g)
       }
       assert(decode_node->input() != nullptr);
 
+      auto encoder = encode_node->encoder();
+      assert(encoder != nullptr);
+
       auto decoder = decode_node->decoder();
       assert(decoder != nullptr);
 
@@ -302,9 +302,6 @@ bool SimplifyDomainConversionPass::run(loco::Graph *g)
     {
       using namespace loco;
 
-      auto encoder = encode_node->encoder();
-      assert(encoder != nullptr);
-
       auto decode_node = dynamic_cast<loco::MatrixDecode *>(encode_node->input());
       if (decode_node == nullptr)
       {
@@ -312,6 +309,9 @@ bool SimplifyDomainConversionPass::run(loco::Graph *g)
       }
       assert(decode_node->input() != nullptr);
 
+      auto encoder = encode_node->encoder();
+      assert(encoder != nullptr);
+
       auto decoder = decode_node->decoder();
       assert(decoder != nullptr);
 
diff --git a/compiler/luci-eval-driver/src/EvalDriver.cpp b/compiler/luci-eval-driver/src/EvalDriver.cpp
index 4762cffe7..0ed35431d 100644
--- a/compiler/luci-eval-driver/src/EvalDriver.cpp
+++ b/compiler/luci-eval-driver/src/EvalDriver.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <luci/Importer.h>
+#include <luci/ImporterEx.h>
 #include <luci_interpreter/Interpreter.h>
 #include <luci/CircleExporter.h>
 #include <luci/CircleFileExpContract.h>
@@ -47,18 +47,6 @@ void writeDataToFile(const std::string &filename, const char *data, size_t data_
   }
 }
 
-std::unique_ptr<luci::Module> importModel(const std::string &filename)
-{
-  std::ifstream fs(filename, std::ifstream::binary);
-  if (fs.fail())
-  {
-    throw std::runtime_error("Cannot open model file \"" + filename + "\".\n");
-  }
-  std::vector<char> model_data((std::istreambuf_iterator<char>(fs)),
-                               std::istreambuf_iterator<char>());
-  return luci::Importer().importModule(circle::GetModel(model_data.data()));
-}
-
 template <typename NodeT> size_t getTensorSize(const NodeT *node)
 {
   uint32_t tensor_size = loco::size(node->dtype());
@@ -91,7 +79,8 @@ int entry(int argc, char **argv)
   const char *output_file = argv[4];
 
   // Load model from the file
-  std::unique_ptr<luci::Module> module = importModel(filename);
+  luci::ImporterEx importer;
+  std::unique_ptr<luci::Module> module = importer.importVerifyModule(filename);
   if (module == nullptr)
   {
     std::cerr << "ERROR: Failed to load '" << filename << "'" << std::endl;
diff --git a/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst b/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst
index d134a6b95..f0df58db3 100644
--- a/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst
+++ b/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst
@@ -12,6 +12,7 @@ REGISTER_KERNEL(Div)
 REGISTER_KERNEL(Elu)
 REGISTER_KERNEL(Exp)
 REGISTER_KERNEL(ExpandDims)
+REGISTER_KERNEL(Fill)
 REGISTER_KERNEL(Floor)
 REGISTER_KERNEL(FloorDiv)
 REGISTER_KERNEL(Equal)
@@ -44,6 +45,7 @@ REGISTER_KERNEL(Reshape)
 REGISTER_KERNEL(ResizeBilinear)
 REGISTER_KERNEL(ResizeNearestNeighbor)
 REGISTER_KERNEL(Rsqrt)
+REGISTER_KERNEL(Shape)
 REGISTER_KERNEL(Softmax)
 REGISTER_KERNEL(SpaceToBatchND)
 REGISTER_KERNEL(SpaceToDepth)
diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h b/compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h
index 15ff0327b..efa6b167e 100644
--- a/compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h
+++ b/compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h
@@ -18,7 +18,7 @@
 #define LUCI_INTERPRETER_PAL_DEQUANTIZE_H
 
 #include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h"
-#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "PALreference_ops.h"
 
 namespace luci_interpreter_pal
 {
diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h b/compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h
index 6046789ae..effb85d54 100644
--- a/compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h
+++ b/compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h
@@ -17,7 +17,7 @@
 #ifndef LUCI_INTERPRETER_PAL_QUANTIZE_H
 #define LUCI_INTERPRETER_PAL_QUANTIZE_H
 
-#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "PALreference_ops.h"
 
 namespace luci_interpreter_pal
 {
diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALreference_ops.h b/compiler/luci-interpreter/pal/cmsisnn/PALreference_ops.h
new file mode 100644
index 000000000..813b1ec2c
--- /dev/null
+++ b/compiler/luci-interpreter/pal/cmsisnn/PALreference_ops.h
@@ -0,0 +1,1568 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
+#define LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <type_traits>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "fixedpoint/fixedpoint.h"
+#include "ruy/profiler/instrumentation.h" // from @ruy
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/add.h"
+#include "tensorflow/lite/kernels/internal/reference/add_n.h"
+#include "tensorflow/lite/kernels/internal/reference/arg_min_max.h"
+#include "tensorflow/lite/kernels/internal/reference/batch_matmul.h"
+#include "tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h"
+#include "tensorflow/lite/kernels/internal/reference/binary_function.h"
+#include "tensorflow/lite/kernels/internal/reference/cast.h"
+#include "tensorflow/lite/kernels/internal/reference/ceil.h"
+#include "tensorflow/lite/kernels/internal/reference/comparisons.h"
+#include "tensorflow/lite/kernels/internal/reference/concatenation.h"
+#include "tensorflow/lite/kernels/internal/reference/conv.h"
+#include "tensorflow/lite/kernels/internal/reference/depth_to_space.h"
+#include "tensorflow/lite/kernels/internal/reference/dequantize.h"
+#include "tensorflow/lite/kernels/internal/reference/div.h"
+#include "tensorflow/lite/kernels/internal/reference/elu.h"
+#include "tensorflow/lite/kernels/internal/reference/exp.h"
+#include "tensorflow/lite/kernels/internal/reference/fill.h"
+#include "tensorflow/lite/kernels/internal/reference/floor.h"
+#include "tensorflow/lite/kernels/internal/reference/floor_div.h"
+#include "tensorflow/lite/kernels/internal/reference/floor_mod.h"
+#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
+#include "tensorflow/lite/kernels/internal/reference/gather.h"
+#include "tensorflow/lite/kernels/internal/reference/hard_swish.h"
+#include "tensorflow/lite/kernels/internal/reference/l2normalization.h"
+#include "tensorflow/lite/kernels/internal/reference/leaky_relu.h"
+#include "tensorflow/lite/kernels/internal/reference/log_softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/logistic.h"
+#include "tensorflow/lite/kernels/internal/reference/maximum_minimum.h"
+#include "tensorflow/lite/kernels/internal/reference/mul.h"
+#include "tensorflow/lite/kernels/internal/reference/neg.h"
+#include "tensorflow/lite/kernels/internal/reference/pad.h"
+#include "tensorflow/lite/kernels/internal/reference/pooling.h"
+#include "tensorflow/lite/kernels/internal/reference/prelu.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
+#include "tensorflow/lite/kernels/internal/reference/quantize.h"
+#include "tensorflow/lite/kernels/internal/reference/reduce.h"
+#include "tensorflow/lite/kernels/internal/reference/requantize.h"
+#include "tensorflow/lite/kernels/internal/reference/resize_bilinear.h"
+#include "tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h"
+#include "tensorflow/lite/kernels/internal/reference/round.h"
+#include "tensorflow/lite/kernels/internal/reference/softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h"
+#include "tensorflow/lite/kernels/internal/reference/space_to_depth.h"
+#include "tensorflow/lite/kernels/internal/reference/strided_slice.h"
+#include "tensorflow/lite/kernels/internal/reference/string_comparisons.h"
+#include "tensorflow/lite/kernels/internal/reference/sub.h"
+#include "tensorflow/lite/kernels/internal/reference/tanh.h"
+#include "tensorflow/lite/kernels/internal/reference/transpose.h"
+#include "tensorflow/lite/kernels/internal/reference/transpose_conv.h"
+#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+namespace tflite
+{
+
+namespace reference_ops
+{
+
+// Elementwise ReLU: writes max(input, 0) for each of the MatchingFlatSize() elements.
+template <typename T>
+inline void Relu(const RuntimeShape &input_shape, const T *input_data,
+                 const RuntimeShape &output_shape, T *output_data)
+{
+  const T zero = 0;
+  const int count = MatchingFlatSize(input_shape, output_shape);
+  for (int idx = 0; idx < count; ++idx)
+  {
+    const T x = input_data[idx];
+    output_data[idx] = x < zero ? zero : x;
+  }
+}
+
+// Elementwise clamp of the input to [-1, 1].
+template <typename T>
+inline void Relu1(const RuntimeShape &input_shape, const T *input_data,
+                  const RuntimeShape &output_shape, T *output_data)
+{
+  ruy::profiler::ScopeLabel label("Relu1 (not fused)");
+  const T hi = 1;
+  const T lo = -1;
+  const int count = MatchingFlatSize(input_shape, output_shape);
+  for (int idx = 0; idx < count; ++idx)
+  {
+    const T x = input_data[idx];
+    output_data[idx] = x > hi ? hi : (x < lo ? lo : x);
+  }
+}
+
+// Elementwise clamp of a float input to [0, 6].
+inline void Relu6(const RuntimeShape &input_shape, const float *input_data,
+                  const RuntimeShape &output_shape, float *output_data)
+{
+  ruy::profiler::ScopeLabel label("Relu6 (not fused)");
+  const float hi = 6;
+  const float lo = 0;
+  const int count = MatchingFlatSize(input_shape, output_shape);
+  for (int idx = 0; idx < count; ++idx)
+  {
+    const float x = input_data[idx];
+    output_data[idx] = x > hi ? hi : (x < lo ? lo : x);
+  }
+}
+
+// Requantizes each element (subtract input_offset, scale by output_multiplier/output_shift,
+// add output_offset) and clamps it to [quantized_activation_min, quantized_activation_max].
+template <typename T>
+inline void ReluX(const tflite::ReluParams &params, const RuntimeShape &input_shape,
+                  const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{
+  ruy::profiler::ScopeLabel label("Quantized ReluX (not fused)");
+  const int count = MatchingFlatSize(input_shape, output_shape);
+  for (int idx = 0; idx < count; ++idx)
+  {
+    const int32 centered = static_cast<int32_t>(input_data[idx]) - params.input_offset;
+    const int32 rescaled = params.output_offset +
+      MultiplyByQuantizedMultiplier(centered, params.output_multiplier, params.output_shift);
+    const int32 lower_bounded = std::max(params.quantized_activation_min, rescaled);
+    output_data[idx] = static_cast<T>(std::min(params.quantized_activation_max, lower_bounded));
+  }
+}
+
+// Clamps each element to [quantized_activation_min, quantized_activation_max] (cast to T).
+template <typename T>
+inline void ReluX(const tflite::ActivationParams &params, const RuntimeShape &input_shape,
+                  const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{
+  ruy::profiler::ScopeLabel label("Quantized ReluX (not fused)");
+  const T hi = params.quantized_activation_max;
+  const T lo = params.quantized_activation_min;
+  const int count = MatchingFlatSize(input_shape, output_shape);
+  for (int idx = 0; idx < count; ++idx)
+  {
+    const T x = input_data[idx];
+    output_data[idx] = x > hi ? hi : (x < lo ? lo : x);
+  }
+}
+
+// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// Quantized uint8 Mul for the "fivefold" broadcast layout described by params.broadcast_shape.
+// The operands (and their offsets) are swapped unless the broadcast category is
+// kFirstInputBroadcastsFast, so the loop nest below only has to handle one orientation.
+inline void BroadcastMulFivefold(const ArithmeticParams &unswitched_params,
+                                 const RuntimeShape &unswitched_input1_shape,
+                                 const uint8 *unswitched_input1_data,
+                                 const RuntimeShape &unswitched_input2_shape,
+                                 const uint8 *unswitched_input2_data,
+                                 const RuntimeShape &output_shape, uint8 *output_data)
+{
+  ArithmeticParams switched_params = unswitched_params;
+  switched_params.input1_offset = unswitched_params.input2_offset;
+  switched_params.input2_offset = unswitched_params.input1_offset;
+
+  const bool use_unswitched = unswitched_params.broadcast_category ==
+                              tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+  const ArithmeticParams &params = use_unswitched ? unswitched_params : switched_params;
+
+  // Fivefold nested loops. The second input resets its position for each
+  // iteration of the second loop. The first input resets its position at the
+  // beginning of the fourth loop. The innermost loop is an elementwise Mul of
+  // sections of the arrays.
+  uint8 *output_data_ptr = output_data;
+  const uint8 *input1_data_ptr = use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+  const uint8 *input2_data_reset = use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+  int y0 = params.broadcast_shape[0];
+  int y1 = params.broadcast_shape[1];
+  int y2 = params.broadcast_shape[2];
+  int y3 = params.broadcast_shape[3];
+  int y4 = params.broadcast_shape[4];
+  for (int i0 = 0; i0 < y0; ++i0)
+  {
+    const uint8 *input2_data_ptr = input2_data_reset; // defined even when y1 == 0
+    for (int i1 = 0; i1 < y1; ++i1)
+    {
+      input2_data_ptr = input2_data_reset;
+      for (int i2 = 0; i2 < y2; ++i2)
+      {
+        for (int i3 = 0; i3 < y3; ++i3)
+        {
+          MulElementwise(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr);
+          input2_data_ptr += y4;
+          output_data_ptr += y4;
+        }
+        input1_data_ptr += y4;
+      }
+    }
+    input2_data_reset = input2_data_ptr;
+  }
+}
+
+// Elementwise product of two int16 buffers treated as fixed-point values with 0 integer
+// bits; the raw product is stored unclamped. `params` is not read by this overload.
+inline void Mul(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+                const int16 *input1_data, const RuntimeShape &input2_shape,
+                const int16 *input2_data, const RuntimeShape &output_shape, int16 *output_data)
+{
+  ruy::profiler::ScopeLabel label("Mul/Int16");
+  // F0 uses 0 integer bits, range [-1, 1].
+  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+  const int count = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  for (int idx = 0; idx < count; idx++)
+  {
+    const F0 lhs = F0::FromRaw(input1_data[idx]);
+    const F0 rhs = F0::FromRaw(input2_data[idx]);
+    output_data[idx] = (lhs * rhs).raw();
+  }
+}
+
+// Multiplies two int16 buffers as 0-integer-bit fixed-point values, divides the raw product
+// by 2^8 with rounding, clamps it and stores `output_offset + result` as uint8.
+inline void Mul(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+                const int16 *input1_data, const RuntimeShape &input2_shape,
+                const int16 *input2_data, const RuntimeShape &output_shape, uint8 *output_data)
+{
+  ruy::profiler::ScopeLabel label("Mul/Int16Uint8");
+  // F0 uses 0 integer bits, range [-1, 1].
+  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+  const int32 zero_point = params.output_offset;
+  const int32 act_min = params.quantized_activation_min;
+  const int32 act_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(act_min, act_max);
+
+  const int count = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  for (int idx = 0; idx < count; idx++)
+  {
+    const F0 product = F0::FromRaw(input1_data[idx]) * F0::FromRaw(input2_data[idx]);
+    const int16 scaled = gemmlowp::RoundingDivideByPOT(product.raw(), 8);
+    int16 bounded = std::min<int16>(act_max - zero_point, scaled);
+    bounded = std::max<int16>(act_min - zero_point, bounded);
+    output_data[idx] = zero_point + bounded;
+  }
+}
+
+// Elementwise int16 subtraction (input1 - input2) on raw fixed-point values that use
+// 0 integer bits.
+//
+// Both params.input1_shift and params.input2_shift must be <= 0 and at least one of them
+// must be 0 (TFLITE_DCHECK). The input with the non-zero shift is divided by 2^(-shift)
+// with rounding, the subtraction saturates, and the result is clamped to
+// [params.quantized_activation_min, params.quantized_activation_max].
+// Only the two shifts and the two activation bounds are read from `params`.
+inline void Sub16(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+                  const int16_t *input1_data, const RuntimeShape &input2_shape,
+                  const int16_t *input2_data, const RuntimeShape &output_shape,
+                  int16_t *output_data)
+{
+  ruy::profiler::ScopeLabel label("Sub/Int16");
+  // F0 uses 0 integer bits, range [-1, 1].
+  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+  const int input1_shift = params.input1_shift;
+  const int count = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  const int16 act_min = params.quantized_activation_min;
+  const int16 act_max = params.quantized_activation_max;
+
+  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
+  TFLITE_DCHECK_LE(input1_shift, 0);
+  TFLITE_DCHECK_LE(params.input2_shift, 0);
+
+  // If input1 carries no shift, input2 is the operand to rescale, and vice versa.
+  const bool input1_is_unshifted = (input1_shift == 0);
+  const int16 *unshifted_data = input1_is_unshifted ? input1_data : input2_data;
+  const int16 *shifted_data = input1_is_unshifted ? input2_data : input1_data;
+  const int right_shift = input1_is_unshifted ? -params.input2_shift : -input1_shift;
+
+  for (int idx = 0; idx < count; ++idx)
+  {
+    const F0 unshifted = F0::FromRaw(unshifted_data[idx]);
+    const F0 shifted = F0::FromRaw(gemmlowp::RoundingDivideByPOT(shifted_data[idx], right_shift));
+
+    // The operand order stays input1 - input2 whichever side was rescaled.
+    int16 raw_output;
+    if (input1_is_unshifted)
+    {
+      raw_output = SaturatingSub(unshifted, shifted).raw();
+    }
+    else
+    {
+      raw_output = SaturatingSub(shifted, unshifted).raw();
+    }
+
+    output_data[idx] = std::min(act_max, std::max(act_min, raw_output));
+  }
+}
+
+// Stacks params.inputs_count inputs along params.axis of the output. A negative axis is
+// counted from the last output dimension, matching what Unpack() accepts.
+template <typename Scalar>
+void Pack(const PackParams &params, const RuntimeShape *const *input_shapes,
+          const Scalar *const *input_data, const RuntimeShape &output_shape, Scalar *output_data)
+{
+  ruy::profiler::ScopeLabel label("Pack");
+  const int dimensions = output_shape.DimensionsCount();
+  const int axis = params.axis < 0 ? params.axis + dimensions : params.axis;
+  TFLITE_DCHECK(axis >= 0 && axis < dimensions);
+  int outer_size = 1;
+  for (int i = 0; i < axis; i++)
+  {
+    outer_size *= output_shape.Dims(i);
+  }
+  int copy_size = 1;
+  for (int i = axis + 1; i < dimensions; i++)
+  {
+    copy_size *= output_shape.Dims(i);
+  }
+  TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
+
+  for (int i = 0; i < params.inputs_count; ++i)
+  {
+    for (int k = 0; k < outer_size; k++)
+    {
+      const int loc = k * params.inputs_count * copy_size + i * copy_size;
+      memcpy(output_data + loc, input_data[i] + copy_size * k, copy_size * sizeof(Scalar));
+    }
+  }
+}
+
+// Splits the input along params.axis into params.num_split outputs; a negative axis is
+// counted from the last input dimension. Every output receives (outer * inner) elements,
+// which must equal output_shape.FlatSize().
+template <typename Scalar>
+void Unpack(const UnpackParams &params, const RuntimeShape &input_shape, const Scalar *input_data,
+            const RuntimeShape &output_shape, Scalar *const *output_datas)
+{
+  ruy::profiler::ScopeLabel label("Unpack");
+  const int rank = input_shape.DimensionsCount();
+  const int split_count = params.num_split;
+  const int axis = params.axis < 0 ? params.axis + rank : params.axis;
+  TFLITE_DCHECK_GE(axis, 0);
+  TFLITE_DCHECK_LT(axis, rank);
+
+  int outer_size = 1;
+  for (int d = 0; d < axis; ++d)
+  {
+    outer_size *= input_shape.Dims(d);
+  }
+  int copy_size = 1;
+  for (int d = axis + 1; d < rank; ++d)
+  {
+    copy_size *= input_shape.Dims(d);
+  }
+  TFLITE_DCHECK_EQ(output_shape.FlatSize(), copy_size * outer_size);
+
+  for (int out = 0; out < split_count; ++out)
+  {
+    for (int outer = 0; outer < outer_size; outer++)
+    {
+      // Source slice [outer][out] of the input goes to slice [outer] of output `out`.
+      Scalar *dst = output_datas[out] + copy_size * outer;
+      const int src_offset = outer * split_count * copy_size + out * copy_size;
+      memcpy(dst, input_data + src_offset, copy_size * sizeof(Scalar));
+    }
+  }
+}
+
+// Packs params.inputs_count uint8 inputs along params.axis of the output (a negative axis is
+// counted from the last output dimension). An input whose zero point and scale equal the
+// output's is copied verbatim; any other input is requantized element by element.
+template <typename Scalar>
+void PackWithScaling(const PackParams &params, const RuntimeShape *const *input_shapes,
+                     const uint8 *const *input_data, const RuntimeShape &output_shape,
+                     uint8 *output_data)
+{
+  ruy::profiler::ScopeLabel label("PackWithScaling");
+  const int dimensions = output_shape.DimensionsCount();
+  const int axis = params.axis < 0 ? params.axis + dimensions : params.axis;
+  const float inverse_output_scale = 1.f / params.output_scale;
+
+  int outer_size = 1;
+  for (int i = 0; i < axis; i++)
+  {
+    outer_size *= output_shape.Dims(i);
+  }
+  int copy_size = 1;
+  for (int i = axis + 1; i < dimensions; i++)
+  {
+    copy_size *= output_shape.Dims(i);
+  }
+  TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
+
+  Scalar *output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++)
+  {
+    for (int i = 0; i < params.inputs_count; ++i)
+    {
+      // Slice k of input i; both branches must start here (not at input_data[i]).
+      const uint8 *input_ptr = input_data[i] + k * copy_size;
+      if (params.input_zeropoint[i] == params.output_zeropoint &&
+          params.input_scale[i] == params.output_scale)
+      {
+        memcpy(output_ptr, input_ptr, copy_size * sizeof(Scalar));
+      }
+      else
+      {
+        assert(false); // kept from the original: aborts here unless NDEBUG is defined
+        const float scale = params.input_scale[i] * inverse_output_scale;
+        const float bias = -params.input_zeropoint[i] * scale;
+        for (int j = 0; j < copy_size; ++j)
+        {
+          const int32_t rounded = static_cast<int32_t>(std::round(input_ptr[j] * scale + bias));
+          const int value = rounded + params.output_zeropoint;
+          output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0));
+        }
+      }
+      output_ptr += copy_size;
+    }
+  }
+}
+
+template <typename Scalar>
+void DepthConcatenation(const ConcatenationParams &params, const RuntimeShape *const *input_shapes,
+                        const Scalar *const *input_data, const RuntimeShape &output_shape,
+                        Scalar *output_data)
+{
+  ruy::profiler::ScopeLabel label("DepthConcatenation");
+  ConcatenationParams depth_params(params); // `params` is const, so adjust a copy
+  depth_params.axis = 3;                    // always concatenate along axis 3
+  Concatenation(depth_params, input_shapes, input_data, output_shape, output_data);
+}
+
+// One float LSTM cell step. The input and previous activations are concatenated along axis 3,
+// run through a fully-connected layer into activ_temp, and that result is split into four
+// equally sized channel groups (input gate, new input, forget gate, output gate) from which
+// output_state and output_activ are computed. `params` is not read by this overload.
+inline void LstmCell(const LstmCellParams &params, const RuntimeShape &unextended_input_shape,
+                     const float *input_data, const RuntimeShape &unextended_prev_activ_shape,
+                     const float *prev_activ_data, const RuntimeShape &weights_shape,
+                     const float *weights_data, const RuntimeShape &unextended_bias_shape,
+                     const float *bias_data, const RuntimeShape &unextended_prev_state_shape,
+                     const float *prev_state_data,
+                     const RuntimeShape &unextended_output_state_shape, float *output_state_data,
+                     const RuntimeShape &unextended_output_activ_shape, float *output_activ_data,
+                     const RuntimeShape &unextended_concat_temp_shape, float *concat_temp_data,
+                     const RuntimeShape &unextended_activ_temp_shape, float *activ_temp_data)
+{
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
+  const RuntimeShape input_shape = RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape prev_activ_shape = RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
+  const RuntimeShape bias_shape = RuntimeShape::ExtendedShape(4, unextended_bias_shape);
+  const RuntimeShape prev_state_shape = RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
+  const RuntimeShape output_state_shape =
+    RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
+  const RuntimeShape output_activ_shape =
+    RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
+  const RuntimeShape concat_temp_shape =
+    RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
+  const RuntimeShape activ_temp_shape = RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
+  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+
+  const int weights_dim_count = weights_shape.DimensionsCount();
+  const int batches = MatchingDim(input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0,
+                                  output_state_shape, 0, output_activ_shape, 0);
+  const int height = MatchingDim(input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1,
+                                 output_state_shape, 1, output_activ_shape, 1);
+  const int width = MatchingDim(input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2,
+                                output_state_shape, 2, output_activ_shape, 2);
+  const int input_depth = input_shape.Dims(3);
+  const int prev_activ_depth = prev_activ_shape.Dims(3);
+  const int total_input_depth = prev_activ_depth + input_depth;
+  TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), total_input_depth);
+  TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
+  const int intern_activ_depth = MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
+  TFLITE_DCHECK_EQ(weights_shape.FlatSize(), intern_activ_depth * total_input_depth);
+  TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
+  const int output_depth = MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
+                                       3, output_activ_shape, 3);
+  TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
+
+  // Concatenate input and prev_activ data together. Plain arrays are enough for the two
+  // operands and avoid allocating std::vector storage on every call.
+  const float *const concat_data[] = {input_data, prev_activ_data};
+  const RuntimeShape *const concat_shapes[] = {&input_shape, &prev_activ_shape};
+  tflite::ConcatenationParams concat_params;
+  concat_params.axis = 3;
+  concat_params.inputs_count = 2;
+  Concatenation(concat_params, concat_shapes, concat_data, concat_temp_shape, concat_temp_data);
+
+  // Fully connected
+  tflite::FullyConnectedParams fc_params;
+  fc_params.float_activation_min = std::numeric_limits<float>::lowest();
+  fc_params.float_activation_max = std::numeric_limits<float>::max();
+  FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape, weights_data,
+                 bias_shape, bias_data, activ_temp_shape, activ_temp_data);
+
+  // Memory state update (the LSTM "guts")
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int w = 0; w < width; ++w)
+    {
+      for (int h = 0; h < height; ++h)
+      {
+        for (int c = 0; c < output_depth; ++c)
+        {
+          const float input_gate =
+            1.f /
+            (1.f +
+             std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 0 * output_depth + c)]));
+          const float new_input =
+            std::tanh(activ_temp_data[Offset(activ_temp_shape, b, h, w, 1 * output_depth + c)]);
+          const float forget_gate =
+            1.f /
+            (1.f +
+             std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 2 * output_depth + c)]));
+          const float output_gate =
+            1.f /
+            (1.f +
+             std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 3 * output_depth + c)]));
+          const float new_state =
+            input_gate * new_input +
+            forget_gate * prev_state_data[Offset(prev_state_shape, b, h, w, c)];
+          output_state_data[Offset(output_state_shape, b, h, w, c)] = new_state;
+          output_activ_data[Offset(output_activ_shape, b, h, w, c)] =
+            output_gate * std::tanh(new_state);
+        }
+      }
+    }
+  }
+}
+
+// Quantized LSTM cell implementation.
+// The quantization of the input, output arrays is as follows:
+//  - The input activations are quantized as uint8 on the interval
+//    [-1, 127/128].
+//    The rationale for that is that it is the natural interval for output
+//    activations (see next point) and these need to be concatenated together.
+//    We could accommodate different ranges by re-scaling, but we empirically
+//    found that setting the input activations range to be [-1, 127/128] in the
+//    first place, removing the need for re-scaling, greatly improves accuracy.
+//  - The output activations are quantized as uint8 on the interval
+//    [-1, 127/128].
+//    The rationale for that is that the definition of a LSTM cell makes them
+//    intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128]
+//    makes for simpler, more accurate fixed-point arithmetic.
+//  - The output-at-previous-timestep state array is obviously quantized as
+//    the output activations.
+//  - The internal LSTM memory (not the output-at-previous-timestep, the other
+//    internal state array) is int16-quantized and may use any power-of-two,
+//    symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call
+//    StateIntegerBits below, see the below discussion of that template
+//    parameter ("The StateIntegerBits template parameter").
+//  - The output of the internal fully-connected node is int16-quantized
+//    on the interval [-8, 8 * 32767/32768], the rationale for which is
+//    explained just below ("Why [-8, 8] for fully-connected output?").
+//
+//
+// === The StateIntegerBits template parameter ===
+//
+// The StateIntegerBits template parameter controls the fixed-point format used
+// to represent the internal memory of the LSTM cell (not the
+// output-at-previous-timestep, the other internal state array). It's currently
+// a template parameter so that the model can control that. The most typical
+// value for StateIntegerBits is 4. Other plausible values are anywhere between
+// 3 and 5. We might eventually standardize on a single supported value, e.g. 4,
+// and drop that template parameter. The reason why it can't be a runtime
+// parameter is that this controls the fixed-point format used, i.e. we need to
+// generate actually different code based on it. In particular, we generate code
+// for a fixed-point tanh() implementation for that format, which internally
+// uses a fixed-point exp() implementation, which internally uses a
+// barrel-shifter with a number of steps that depends on StateIntegerBits.
+// Another consequence of that is that a higher value of StateIntegerBits
+// results in a more expensive implementation (more barrel shifter steps
+// needed).
+//
+//
+// === Why [-8, 8] for fully-connected output? ===
+//
+// This array is only fed to Logistic and Tanh functions, for which
+// the quantized implementation will want to use fixed-point arithmetic,
+// requiring a power-of-two representation interval. Thus, we should right
+// away quantize this array to a power-of-two interval; otherwise,
+// implementation will need to rescale that, losing any benefit that a tighter
+// representation interval might otherwise yield, while introducing some
+// numerical error and computational overhead.
+//
+// Now, Logistic and Tanh
+// are nearly constant (nearly equal to their horizontal asymptotes)
+// outside of a small bounded interval around 0:
+//
+//   Logistic(4) = 1 - 1.8e-2     Tanh(4) = 1 - 6.7e-4
+//   Logistic(8) = 1 - 3.4e-4     Tanh(8) = 1 - 2.3e-7
+//   Logistic(16) = 1 - 1.1e-7    Tanh(16) = 1 - 2.5e-14
+//
+// From this, we see that clamping to [-4, 4] would be too inaccurate
+// (the error of 1.8e-2 on Logistic would be felt even in 8bit precision)
+// while clamping to [-16, 16] would make no difference even in float32.
+// However, for a fixed-point implementation in 16-bit integers, using 5
+// integer bits to represent the [-16, 16] range would leave only 11
+// fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
+// representable values. Notice that is higher than the
+// worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
+// Using [-8, 8] thus seems like the better compromise overall, enjoying
+// an increment of 2.4e-4 between representable values and a worst-case
+// clamping error of 3.4e-4, both better than the increment of 4.9e-4 with
+// [-16, 16].
+//
+// Moreover, all other things being equal, it is nice to choose the narrower
+// representation range, as that makes the implementation of fixed-point
+// math functions a little cheaper (each integer bit requires an additional
+// barrel-shifter step in the implementation of exp(-x)). That is further
+// reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make
+// sense for 32-bit float or 32-bit fixed-point quantization, but we are
+// aiming for 16-bit fixed-point quantization of these internal nodes here.
+//
+template <int StateIntegerBits>
+inline void
+LstmCell(const LstmCellParams &params, const RuntimeShape &unextended_input_shape,
+         const uint8 *input_data_uint8, const RuntimeShape &unextended_prev_activ_shape,
+         const uint8 *prev_activ_data_uint8, const RuntimeShape &weights_shape,
+         const uint8 *weights_data_uint8, const RuntimeShape &unextended_bias_shape,
+         const int32 *bias_data_int32, const RuntimeShape &unextended_prev_state_shape,
+         const int16 *prev_state_data_int16, const RuntimeShape &unextended_output_state_shape,
+         int16 *output_state_data_int16, const RuntimeShape &unextended_output_activ_shape,
+         uint8 *output_activ_data_uint8, const RuntimeShape &unextended_concat_temp_shape,
+         uint8 *concat_temp_data_uint8, const RuntimeShape &unextended_activ_temp_shape,
+         int16 *activ_temp_data_int16, void *gemmlowp_context)
+{
+  (void)gemmlowp_context; // only used in optimized code.
+  int32 weights_zero_point = params.weights_zero_point;
+  int32 accum_multiplier = params.accum_multiplier;
+  int accum_shift = params.accum_shift;
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
+  const RuntimeShape input_shape = RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape prev_activ_shape = RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
+  const RuntimeShape bias_shape = RuntimeShape::ExtendedShape(4, unextended_bias_shape);
+  const RuntimeShape prev_state_shape = RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
+  const RuntimeShape output_state_shape =
+    RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
+  const RuntimeShape output_activ_shape =
+    RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
+  const RuntimeShape concat_temp_shape =
+    RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
+  const RuntimeShape activ_temp_shape = RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
+  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+
+  // Gather dimensions information, and perform consistency checks.
+  const int weights_dim_count = weights_shape.DimensionsCount();
+  const int outer_size = MatchingFlatSizeSkipDim(input_shape, 3, prev_activ_shape, prev_state_shape,
+                                                 output_state_shape, output_activ_shape);
+  const int input_depth = input_shape.Dims(3);
+  const int prev_activ_depth = prev_activ_shape.Dims(3);
+  const int total_input_depth = prev_activ_depth + input_depth;
+  TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), total_input_depth);
+  const int intern_activ_depth = MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
+  TFLITE_DCHECK_EQ(weights_shape.FlatSize(), intern_activ_depth * total_input_depth);
+  TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
+  TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
+  const int output_depth = MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
+                                       3, output_activ_shape, 3);
+  TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
+  const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3);
+  const int fc_output_depth =
+    MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3);
+  const int fc_accum_depth = total_input_depth;
+  TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth);
+
+  // Depth-concatenate input and prev_activ data (in that order) along axis 3.
+  uint8 const *concat_input_arrays_data[2] = {input_data_uint8, prev_activ_data_uint8};
+  const RuntimeShape *concat_input_arrays_shapes[2] = {&input_shape, &prev_activ_shape};
+  tflite::ConcatenationParams concat_params;
+  concat_params.axis = 3;
+  concat_params.inputs_count = 2;
+  Concatenation(concat_params, concat_input_arrays_shapes, concat_input_arrays_data,
+                concat_temp_shape, concat_temp_data_uint8);
+
+  // Implementation of the fully connected node inside the LSTM cell.
+  // The operands are 8-bit integers, the accumulators are internally 32bit
+  // integers, and the output is 16-bit fixed-point with 3 integer bits so
+  // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
+  // is explained in the function comment above.
+  for (int b = 0; b < fc_batches; ++b)
+  {
+    for (int out_c = 0; out_c < fc_output_depth; ++out_c)
+    {
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum = bias_data_int32[out_c];
+      // Accumulation loop: inputs are offset by 128, weights by weights_zero_point.
+      for (int d = 0; d < fc_accum_depth; ++d)
+      {
+        int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
+        int16 weights_val = weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
+        accum += input_val * weights_val;
+      }
+      // Down-scale the final int32 accumulator to the scale used by our
+      // (16-bit, using 3 integer bits) fixed-point format. The quantized
+      // multiplier and shift here have been pre-computed offline
+      // (e.g. by toco).
+      accum = MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
+      // Saturate, cast to int16, and store to the temporary activations array.
+      accum = std::max(-32768, std::min(32767, static_cast<int>(accum)));
+      activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
+    }
+  }
+
+  // Rest of the LSTM cell: tanh and logistic math functions, and some adds
+  // and muls, all done in 16-bit fixed-point.
+  for (int b = 0; b < outer_size; ++b)
+  {
+    for (int c = 0; c < output_depth; ++c)
+    {
+      // Define the fixed-point data types that we will use here. All use
+      // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
+      // They only differ by the number of integral vs. fractional bits,
+      // determining the range of values that they can represent.
+      //
+      // F0 uses 0 integer bits, range [-1, 1].
+      // This is the return type of math functions such as tanh, logistic,
+      // whose range is in [-1, 1].
+      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+      // F3 uses 3 integer bits, range [-8, 8].
+      // This is the range of the previous fully-connected node's output,
+      // which is our input here.
+      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+      // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
+      // 2^StateIntegerBits]. It's used to represent the internal state, whose
+      // number of integer bits is currently dictated by the model. See comment
+      // on the StateIntegerBits template parameter above.
+      using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
+      // Implementation of input gate, using fixed-point logistic function.
+      F3 input_gate_input =
+        F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
+      F0 input_gate_output = gemmlowp::logistic(input_gate_input);
+      // Implementation of input modulation gate, using fixed-point tanh
+      // function.
+      F3 input_modulation_gate_input =
+        F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
+      F0 input_modulation_gate_output = gemmlowp::tanh(input_modulation_gate_input);
+      // Implementation of forget gate, using fixed-point logistic function.
+      F3 forget_gate_input =
+        F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
+      F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
+      // Implementation of output gate, using fixed-point logistic function.
+      F3 output_gate_input =
+        F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
+      F0 output_gate_output = gemmlowp::logistic(output_gate_input);
+      // Implementation of internal multiplication nodes, still in fixed-point.
+      F0 input_times_input_modulation = input_gate_output * input_modulation_gate_output;
+      FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
+      FS prev_state_times_forget_state = forget_gate_output * prev_state;
+      // Implementation of internal addition node, saturating.
+      FS new_state =
+        gemmlowp::SaturatingAdd(gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
+                                prev_state_times_forget_state);
+      // Implementation of last internal Tanh node, still in fixed-point.
+      // Since a Tanh fixed-point implementation is specialized for a given
+      // number of integer bits, and each specialization can have a substantial
+      // code size, and we already used above a Tanh on an input with 3 integer
+      // bits, and per the table in the above function comment there is no
+      // significant accuracy to be lost by clamping to [-8, +8] for a
+      // 3-integer-bits representation, let us just do that. This helps people
+      // porting this to targets where code footprint must be minimized.
+      F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
+      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
+      // Store the new internal state back to memory, as 16-bit integers.
+      // Note: here we store the original value with StateIntegerBits, not
+      // the rescaled 3-integer-bits value fed to tanh.
+      output_state_data_int16[b * output_depth + c] = new_state.raw();
+      // Down-scale the output activations to 8-bit integers, saturating,
+      // and store back to memory as uint8 with an offset of 128.
+      int16 rescaled_output_activ = gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
+      int16 clamped_output_activ =
+        std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
+      output_activ_data_uint8[b * output_depth + c] = 128 + clamped_output_activ;
+    }
+  }
+}
+
+// Splits `input_data` along `params.axis` into `params.num_split` outputs whose
+// extents along that axis are given by `output_shapes` and must sum to the input's.
+template <typename Scalar>
+void Split(const SplitParams &params, const RuntimeShape &input_shape, const Scalar *input_data,
+           const RuntimeShape *const *output_shapes, Scalar *const *output_data)
+{
+  ruy::profiler::ScopeLabel label("Split");
+  const int split_dimensions = input_shape.DimensionsCount();
+  int axis = params.axis < 0 ? params.axis + split_dimensions : params.axis;
+  int outputs_count = params.num_split;
+  TFLITE_DCHECK_GE(axis, 0);
+  TFLITE_DCHECK_LT(axis, split_dimensions);
+
+  // Every output must agree with the input off-axis; tally the on-axis extents.
+  int64_t split_size = 0;
+  for (int i = 0; i < outputs_count; i++)
+  {
+    TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), split_dimensions);
+    for (int j = 0; j < split_dimensions; j++)
+    {
+      if (j != axis)
+        MatchingDim(*output_shapes[i], j, input_shape, j);
+    }
+    split_size += output_shapes[i]->Dims(axis);
+  }
+  TFLITE_DCHECK_EQ(split_size, input_shape.Dims(axis));
+  int64_t outer_size = 1;
+  for (int i = 0; i < axis; ++i)
+    outer_size *= input_shape.Dims(i);
+  // For all output arrays,
+  // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+  int64_t base_inner_size = 1;
+  for (int i = axis + 1; i < split_dimensions; ++i)
+    base_inner_size *= input_shape.Dims(i);
+
+  // Sizes and offsets stay in 64 bits: narrowing the per-output chunk size or the
+  // destination offset to int would truncate for tensors beyond INT_MAX elements.
+  const Scalar *input_ptr = input_data;
+  for (int64_t k = 0; k < outer_size; k++)
+  {
+    for (int i = 0; i < outputs_count; ++i)
+    {
+      const int64_t copy_size = output_shapes[i]->Dims(axis) * base_inner_size;
+      memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar));
+      input_ptr += copy_size;
+    }
+  }
+}
+
+inline int NodeOffset(int b, int h, int w, int height, int width)
+{
+  return w + width * (h + height * b); // (b, h, w) -> flat index over [*, height, width]
+}
+
+// Scales each element by (bias + alpha * sum of squares over nearby channels)^(-beta).
+inline void LocalResponseNormalization(const tflite::LocalResponseNormalizationParams &op_params,
+                                       const RuntimeShape &input_shape, const float *input_data,
+                                       const RuntimeShape &output_shape, float *output_data)
+{
+  const int last_dim = input_shape.DimensionsCount() - 1;
+  const int num_rows = MatchingFlatSizeSkipDim(input_shape, last_dim, output_shape);
+  const int depth = MatchingDim(input_shape, last_dim, output_shape, last_dim);
+  for (int row = 0; row < num_rows; ++row)
+  {
+    const float *in_row = input_data + row * depth;
+    float *out_row = output_data + row * depth;
+    for (int c = 0; c < depth; ++c)
+    {
+      // Half-open channel window [c - range, c + range), clipped to [0, depth).
+      const int window_begin = std::max(0, static_cast<int>(c - op_params.range));
+      const int window_end = std::min(depth, static_cast<int>(c + op_params.range));
+      float sum_sq = 0.f;
+      for (int k = window_begin; k < window_end; ++k)
+        sum_sq += in_row[k] * in_row[k];
+      const float scale = std::pow(op_params.bias + op_params.alpha * sum_sq, -op_params.beta);
+      out_row[c] = in_row[c] * scale;
+    }
+  }
+}
+
+inline void Dequantize(const RuntimeShape &input_shape, const Eigen::half *input_data,
+                       const RuntimeShape &output_shape, float *output_data)
+{
+  // Element-wise conversion from Eigen::half to float over matching flat sizes.
+  const int num_elements = MatchingFlatSize(input_shape, output_shape);
+  float *dst = output_data;
+  for (int n = 0; n < num_elements; ++n)
+    dst[n] = static_cast<float>(input_data[n]);
+}
+
+inline void FakeQuant(const tflite::FakeQuantParams &op_params, const RuntimeShape &input_shape,
+                      const float *input_data, const RuntimeShape &output_shape, float *output_data)
+{
+  ruy::profiler::ScopeLabel label("FakeQuant");
+  const float range_min = op_params.minmax.min;
+  const float range_max = op_params.minmax.max;
+  // Zero must be exactly representable, so the requested [min, max] range is
+  // expected to contain it and to be non-empty.
+  TFLITE_DCHECK_LE(range_min, 0.0f);
+  TFLITE_DCHECK_GE(range_max, 0.0f);
+  TFLITE_DCHECK_LT(range_min, range_max);
+  // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor: the quantized
+  // levels span [0, 2^num_bits - 1].
+  const int level_min = 0;
+  const int level_max = (1 << op_params.num_bits) - 1;
+  float nudged_min, nudged_max, nudged_scale;
+  NudgeQuantizationRange(range_min, range_max, level_min, level_max, &nudged_min, &nudged_max,
+                         &nudged_scale);
+  FakeQuantizeArray(nudged_scale, nudged_min, nudged_max, input_data, output_data,
+                    MatchingFlatSize(input_shape, output_shape));
+}
+
+// Values computed once by `GatherNdHelper` and shared by `GatherNd` and `GatherNdString`.
+struct GatherNdHelperResult
+{
+  int n_slices;   // product of all indices dimensions except the last
+  int slice_size; // product of the params dimensions from position indices_nd onward
+  int indices_nd; // extent of the last indices dimension
+  std::vector<int> dims_to_count; // per leading params dim: elements spanned by one index step
+};
+
+// Derives the slice geometry that `GatherNd` and `GatherNdString` both rely on.
+inline GatherNdHelperResult GatherNdHelper(const RuntimeShape &params_shape,
+                                           const RuntimeShape &indices_shape)
+{
+  const int indices_rank = indices_shape.DimensionsCount();
+  const int params_rank = params_shape.DimensionsCount();
+
+  GatherNdHelperResult result;
+  // The innermost indices dimension holds one coordinate tuple per slice.
+  result.indices_nd = indices_shape.Dims(indices_rank - 1);
+  // All outer indices dimensions together enumerate the slices to gather.
+  result.n_slices = 1;
+  for (int d = 0; d + 1 < indices_rank; ++d)
+    result.n_slices *= indices_shape.Dims(d);
+  // Params dimensions not addressed by a tuple make up one slice.
+  result.slice_size = 1;
+  for (int d = result.indices_nd; d < params_rank; ++d)
+    result.slice_size *= params_shape.Dims(d);
+
+  // dims_to_count[d]: number of params elements spanned by one step along dim d.
+  result.dims_to_count.assign(result.indices_nd, 0);
+  int stride = params_shape.FlatSize();
+  for (int d = 0; d < result.indices_nd; ++d)
+  {
+    stride /= params_shape.Dims(d);
+    result.dims_to_count[d] = stride;
+  }
+  return result;
+}
+
+template <typename ParamsT, typename IndicesT = int32>
+inline void GatherNd(const RuntimeShape &params_shape, const ParamsT *params_data,
+                     const RuntimeShape &indices_shape, const IndicesT *indices_data,
+                     const RuntimeShape &output_shape, ParamsT *output_data)
+{
+  ruy::profiler::ScopeLabel label("GatherNd");
+
+  const GatherNdHelperResult geom = GatherNdHelper(params_shape, indices_shape);
+  const IndicesT *tuple = indices_data; // coordinate tuple of the current slice
+  ParamsT *dst = output_data;           // where the current slice is written
+  for (int s = 0; s < geom.n_slices; ++s, tuple += geom.indices_nd, dst += geom.slice_size)
+  {
+    // Fold the coordinate tuple into a flat element offset into params_data.
+    int src_offset = 0;
+    for (int d = 0; d < geom.indices_nd; ++d)
+      src_offset += tuple[d] * geom.dims_to_count[d];
+    std::memcpy(dst, params_data + src_offset, sizeof(ParamsT) * geom.slice_size);
+  }
+}
+
+#ifndef TF_LITE_STATIC_MEMORY
+template <typename IndicesT = int32>
+inline void GatherNdString(const RuntimeShape &params_shape, const TfLiteTensor *params_data,
+                           const RuntimeShape &indices_shape, const IndicesT *indices_data,
+                           const RuntimeShape &output_shape, TfLiteTensor *output_data)
+{
+  ruy::profiler::ScopeLabel label("GatherNdString");
+
+  const GatherNdHelperResult geom = GatherNdHelper(params_shape, indices_shape);
+  DynamicBuffer gathered;
+  const IndicesT *tuple = indices_data; // coordinate tuple of the current slice
+  for (int s = 0; s < geom.n_slices; ++s, tuple += geom.indices_nd)
+  {
+    // Flat index of the first string of the slice addressed by this tuple.
+    int first = 0;
+    for (int d = 0; d < geom.indices_nd; ++d)
+      first += tuple[d] * geom.dims_to_count[d];
+    // Append the strings of that slice one by one, in order.
+    for (int k = 0; k < geom.slice_size; ++k)
+      gathered.AddString(GetString(params_data, first + k));
+  }
+  // Hand the collected strings over to the output tensor.
+  gathered.WriteToTensor(output_data, /*new_shape=*/nullptr);
+}
+#endif
+
+// Zero-fills the output, then adds every slice of `updates_data` at the position
+// named by its coordinate tuple in `indices_data`; repeated positions accumulate.
+template <typename IndicesT, typename UpdatesT>
+inline void ScatterNd(const RuntimeShape &indices_shape, const IndicesT *indices_data,
+                      const RuntimeShape &updates_shape, const UpdatesT *updates_data,
+                      const RuntimeShape &output_shape, UpdatesT *output_data)
+{
+  ruy::profiler::ScopeLabel label("ScatterNd");
+
+  const int outer_dims = indices_shape.DimensionsCount() - 1;
+  const int indices_nd = indices_shape.Dims(outer_dims);
+  const int updates_dims = updates_shape.DimensionsCount();
+
+  // One slice per coordinate tuple; a slice covers the trailing updates dimensions.
+  int n_slices = 1;
+  for (int d = 0; d < outer_dims; ++d)
+    n_slices *= indices_shape.Dims(d);
+  int slice_size = 1;
+  for (int d = outer_dims; d < updates_dims; ++d)
+    slice_size *= updates_shape.Dims(d);
+
+  // dims_to_count[d]: number of output elements spanned by one step along dim d.
+  const int output_flat_size = output_shape.FlatSize();
+  std::vector<int> dims_to_count(indices_nd, 0);
+  int stride = output_flat_size;
+  for (int d = 0; d < indices_nd; ++d)
+  {
+    stride /= output_shape.Dims(d);
+    dims_to_count[d] = stride;
+  }
+  memset(output_data, 0, sizeof(UpdatesT) * output_flat_size);
+  const IndicesT *tuple = indices_data;
+  const UpdatesT *src = updates_data;
+  for (int s = 0; s < n_slices; ++s, tuple += indices_nd, src += slice_size)
+  {
+    int to_pos = 0;
+    for (int d = 0; d < indices_nd; ++d)
+    {
+      const IndicesT idx = tuple[d];
+      TFLITE_DCHECK(0 <= idx && idx < output_shape.Dims(d));
+      to_pos += idx * dims_to_count[d];
+    }
+    for (int k = 0; k < slice_size; k++)
+      output_data[to_pos + k] += src[k];
+  }
+}
+
+// Visits every element of the requested slice and hands its flat input offset to
+// `writer`; begin/size vectors shorter than 5 entries are front-padded to 5.
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+                  const RuntimeShape &output_shape, SequentialTensorWriter<T> *writer)
+{
+  const RuntimeShape ext_shape = RuntimeShape::ExtendedShape(5, input_shape);
+  TFLITE_DCHECK_LE(op_params.begin_count, 5);
+  TFLITE_DCHECK_LE(op_params.size_count, 5);
+  const int begin_count = op_params.begin_count;
+  const int size_count = op_params.size_count;
+  // Front-pad begin/size: missing leading entries mean "start at 0" and "take all".
+  std::array<int, 5> start;
+  std::array<int, 5> stop;
+  for (int dim = 0; dim < 5; ++dim)
+  {
+    const int from_back = 5 - dim; // position counted from the end of the vectors
+    const bool has_begin = begin_count >= from_back;
+    const bool has_size = size_count >= from_back;
+    start[dim] = has_begin ? op_params.begin[begin_count - from_back] : 0;
+    if (has_size && op_params.size[size_count - from_back] != -1)
+    {
+      stop[dim] = start[dim] + op_params.size[size_count - from_back];
+    }
+    else
+    {
+      stop[dim] = ext_shape.Dims(dim); // absent or -1 size: run to the end of the dim
+    }
+  }
+
+  // Nested traversal with the last index (i4) varying fastest.
+  for (int i0 = start[0]; i0 < stop[0]; ++i0)
+    for (int i1 = start[1]; i1 < stop[1]; ++i1)
+      for (int i2 = start[2]; i2 < stop[2]; ++i2)
+        for (int i3 = start[3]; i3 < stop[3]; ++i3)
+          for (int i4 = start[4]; i4 < stop[4]; ++i4)
+            writer->Write(Offset(ext_shape, i0, i1, i2, i3, i4));
+}
+
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+                  const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{ // Raw-pointer overload: wraps the buffers in a writer and defers to the writer-based Slice.
+  SequentialTensorWriter<T> writer(input_data, output_data);
+  return Slice(op_params, input_shape, output_shape, &writer);
+}
+
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+                  const TfLiteTensor *input, const RuntimeShape &output_shape, TfLiteTensor *output)
+{ // TfLiteTensor overload: wraps the tensors in a writer and defers to the writer-based Slice.
+  SequentialTensorWriter<T> writer(input, output);
+  return Slice(op_params, input_shape, output_shape, &writer);
+}
+
+// Caps every element of input1 at input2_data[0]; input2 is treated as a scalar.
+template <typename T>
+void Minimum(const RuntimeShape &input1_shape, const T *input1_data, const T *input2_data,
+             const RuntimeShape &output_shape, T *output_data)
+{
+  const int count = MatchingFlatSize(input1_shape, output_shape);
+  const T cap = input2_data[0]; // only element 0 of input2 is consulted
+  for (int n = 0; n < count; ++n)
+  {
+    output_data[n] = input1_data[n] > cap ? cap : input1_data[n];
+  }
+}
+
+// Overload that also accepts the (ignored) shape of the second input, so that call
+// sites, e.g. in generated code, can use the same argument list as other binary ops.
+template <typename T>
+inline void Minimum(const RuntimeShape &input1_shape, const T *input1_data, const RuntimeShape &,
+                    const T *input2_data, const RuntimeShape &output_shape, T *output_data)
+{
+  // Drop shape of second input: not needed.
+  Minimum(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+// Raises every element of input1 to at least input2_data[0]; input2 is treated as a scalar.
+template <typename T>
+void Maximum(const RuntimeShape &input1_shape, const T *input1_data, const T *input2_data,
+             const RuntimeShape &output_shape, T *output_data)
+{
+  const int count = MatchingFlatSize(input1_shape, output_shape);
+  const T floor_value = input2_data[0]; // only element 0 of input2 is consulted
+  for (int n = 0; n < count; ++n)
+  {
+    output_data[n] = input1_data[n] < floor_value ? floor_value : input1_data[n];
+  }
+}
+
+// Overload that also accepts the (ignored) shape of the second input, so that call
+// sites, e.g. in generated code, can use the same argument list as other binary ops.
+template <typename T>
+inline void Maximum(const RuntimeShape &input1_shape, const T *input1_data, const RuntimeShape &,
+                    const T *input2_data, const RuntimeShape &output_shape, T *output_data)
+{
+  // Drop shape of second input: not needed.
+  Maximum(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+template <typename T1, typename T2, typename T3>
+void ArgMax(const RuntimeShape &input1_shape, const T1 *input1_data, const T3 *input2_data,
+            const RuntimeShape &output_shape, T2 *output_data)
+{ // Forwards to ArgMinMax, using std::greater<T1> as the comparison.
+  ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data, std::greater<T1>());
+}
+
+// Overload that also accepts the (ignored) shape of the second input, so that call
+// sites, e.g. in generated code, can use the same argument list as other binary ops.
+template <typename T1, typename T2, typename T3>
+inline void ArgMax(const RuntimeShape &input1_shape, const T1 *input1_data,
+                   const RuntimeShape & /*input2_shape*/, const T3 *input2_data,
+                   const RuntimeShape &output_shape, T2 *output_data)
+{
+  // Drop shape of second input: not needed, hence left unnamed like in Minimum/Maximum.
+  ArgMax(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+// Element-wise select: output[i] is x[i] where condition[i] is truthy, else y[i].
+template <typename D, typename T>
+void Select(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+            const RuntimeShape &input_x_shape, const T *input_x_data,
+            const RuntimeShape &input_y_shape, const T *input_y_data,
+            const RuntimeShape &output_shape, T *output_data)
+{
+  // Four single-element tensors are accepted whatever their ranks (scalar or
+  // one-element); otherwise the flat sizes are matched against each other.
+  const bool all_single = input_condition_shape.FlatSize() == 1 &&
+                          input_x_shape.FlatSize() == 1 && input_y_shape.FlatSize() == 1 &&
+                          output_shape.FlatSize() == 1;
+  int64_t flatsize = 1;
+  if (!all_single)
+    flatsize = MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape);
+  for (int64_t i = 0; i < flatsize; ++i)
+  {
+    if (input_condition_data[i])
+      output_data[i] = input_x_data[i];
+    else
+      output_data[i] = input_y_data[i];
+  }
+}
+
+// Copies, for each condition element i, the i-th block of x (condition truthy) or of
+// y (otherwise) to the output; for a rank-0 condition a block spans the whole tensor.
+template <typename D, typename T>
+void RankOneSelect(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+                   const RuntimeShape &input_x_shape, const T *input_x_data,
+                   const RuntimeShape &input_y_shape, const T *input_y_data,
+                   const RuntimeShape &output_shape, T *output_data)
+{
+  const int64_t outer_size = input_condition_shape.FlatSize();
+  int64_t inner_size;
+  if (input_condition_shape.DimensionsCount() != 0)
+  {
+    TFLITE_DCHECK_EQ(MatchingDim(input_x_shape, 0, input_y_shape, 0, output_shape, 0), outer_size);
+    inner_size = MatchingFlatSizeSkipDim(input_x_shape, 0, input_y_shape, output_shape);
+  }
+  else
+  {
+    inner_size = MatchingFlatSize(input_x_shape, input_y_shape, output_shape);
+  }
+
+  for (int64_t i = 0, offset = 0; i < outer_size; ++i, offset += inner_size)
+  {
+    const T *chosen = input_condition_data[i] ? input_x_data : input_y_data;
+    memcpy(output_data + offset, chosen + offset, inner_size * sizeof(T));
+  }
+}
+
+// Broadcasting select for tensors of rank <= 4: every output element takes the
+// (broadcast) x value where the (broadcast) condition is truthy, else the y value.
+template <typename D, typename T>
+void BroadcastSelect4DSlow(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+                           const RuntimeShape &input_x_shape, const T *input_x_data,
+                           const RuntimeShape &input_y_shape, const T *input_y_data,
+                           const RuntimeShape &output_shape, T *output_data)
+{
+  TFLITE_DCHECK_LE(input_condition_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(input_x_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(input_y_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
+
+  const RuntimeShape extended_output_shape = RuntimeShape::ExtendedShape(4, output_shape);
+
+  NdArrayDesc<4> desc_condition;
+  NdArrayDesc<4> desc_x;
+  NdArrayDesc<4> desc_y;
+  NdArrayDescsForElementwiseBroadcast(input_condition_shape, input_x_shape, input_y_shape,
+                                      &desc_condition, &desc_x, &desc_y);
+
+  // Variables follow the Tensorflow naming (batch_number, row, col, channel)
+  // with extents (batches, height, width, depth). The trailing dimension
+  // changes most rapidly, so the channel loop is the innermost one for the
+  // best cache behavior.
+  const int batches = extended_output_shape.Dims(0);
+  const int height = extended_output_shape.Dims(1);
+  const int width = extended_output_shape.Dims(2);
+  const int depth = extended_output_shape.Dims(3);
+
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int y = 0; y < height; ++y)
+    {
+      for (int x = 0; x < width; ++x)
+      {
+        for (int c = 0; c < depth; ++c)
+        {
+          const bool take_x = input_condition_data[SubscriptToIndex(desc_condition, b, y, x, c)];
+          const T &from_x = input_x_data[SubscriptToIndex(desc_x, b, y, x, c)];
+          const T &from_y = input_y_data[SubscriptToIndex(desc_y, b, y, x, c)];
+          T &dst = output_data[Offset(extended_output_shape, b, y, x, c)];
+          dst = take_x ? from_x : from_y;
+        }
+      }
+    }
+  }
+}
+
+// Writes the row-major coordinates of every truthy element of the condition
+// tensor to `output_data`, one value per dimension, in flat-index order.
+template <typename D, typename T>
+void SelectTrueCoords(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+                      T *output_data)
+{
+  // Sizes stay in the signed type used for shape queries elsewhere in this file,
+  // so the loop comparisons below do not mix signed and unsigned operands.
+  const int size = input_condition_shape.FlatSize();
+  if (size == 0)
+    return; // Dimension is zero, in which case we don't need to output.
+  const int cond_rank = input_condition_shape.DimensionsCount();
+
+  // dims_to_count[i]: number of elements spanned by one step along dimension i.
+  std::vector<int> dims_to_count(cond_rank, 0);
+  int cur_flat_size = size;
+  for (int i = 0; i < cond_rank; ++i)
+  {
+    dims_to_count[i] = cur_flat_size / input_condition_shape.Dims(i);
+    cur_flat_size = dims_to_count[i];
+  }
+
+  int64_t output_index = 0; // 64-bit: output_index * cond_rank may exceed the int range
+  for (int i = 0; i < size; ++i)
+  {
+    if (!input_condition_data[i])
+      continue;
+    // Insert the coordinate of the current item (row major) into output.
+    int flat_index = i;
+    for (int j = 0; j < cond_rank; ++j)
+    {
+      output_data[output_index * cond_rank + j] = flat_index / dims_to_count[j];
+      flat_index %= dims_to_count[j];
+    }
+    output_index++;
+  }
+}
+
+// For easy implementation, the indices is always a vector of size-4 vectors.
+// The output is first filled with `default_value`; each index entry then
+// receives either its own value or, when `value_is_scalar`, the first value.
+template <typename T, typename TI>
+inline void SparseToDense(const std::vector<std::vector<TI>> &indices, const T *values,
+                          T default_value, bool value_is_scalar,
+                          const RuntimeShape &unextended_output_shape, T *output_data)
+{
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape);
+  const int value_count = indices.size();
+
+  // First fill the output_data with default value.
+  const int num_elements = output_shape.FlatSize();
+  for (int n = 0; n < num_elements; ++n)
+    output_data[n] = default_value;
+
+  // Stores one value at the 4-D coordinate held in `index`.
+  const auto store = [&](const std::vector<TI> &index, const T value) {
+    TFLITE_DCHECK_EQ(index.size(), 4);
+    output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] = value;
+  };
+
+  // The scalar case gets its own loop so the flag is not re-tested per element.
+  if (value_is_scalar)
+  {
+    for (int n = 0; n < value_count; ++n)
+    {
+      store(indices[n], *values); // just use the first value.
+    }
+  }
+  else
+  {
+    // Go through the values and indices to fill the sparse values.
+    for (int n = 0; n < value_count; ++n)
+    {
+      store(indices[n], values[n]);
+    }
+  }
+}
+
+// Element-wise power: output[i] = input1[i] raised to input2[i].
+template <typename T>
+inline void Pow(const RuntimeShape &input1_shape, const T *input1_data,
+                const RuntimeShape &input2_shape, const T *input2_data,
+                const RuntimeShape &output_shape, T *output_data)
+{
+  // All three shapes are matched on flat size; no broadcasting happens here.
+  const int count = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int n = 0; n < count; ++n)
+    output_data[n] = std::pow(input1_data[n], input2_data[n]);
+}
+
+// Element-wise power with broadcasting, for tensors of rank <= 4.
+template <typename T>
+inline void BroadcastPow4DSlow(const RuntimeShape &unextended_input1_shape, const T *input1_data,
+                               const RuntimeShape &unextended_input2_shape, const T *input2_data,
+                               const RuntimeShape &unextended_output_shape, T *output_data)
+{
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
+                                      &desc2);
+
+  // Walk the rank-4 output; the descriptors translate each output coordinate
+  // into the matching element of either input.
+  const int batches = output_shape.Dims(0);
+  const int height = output_shape.Dims(1);
+  const int width = output_shape.Dims(2);
+  const int depth = output_shape.Dims(3);
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int y = 0; y < height; ++y)
+      for (int x = 0; x < width; ++x)
+        for (int c = 0; c < depth; ++c)
+        {
+          const T base = input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+          const T exponent = input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          output_data[Offset(output_shape, b, y, x, c)] = std::pow(base, exponent);
+        }
+  }
+}
+
+// Reverses the order of the entries along `axis`; the data inside each entry
+// and the arrangement of all other dimensions are left as they are.
+template <typename Scalar>
+void Reverse(int axis, const RuntimeShape &input_shape, const Scalar *input_data,
+             const RuntimeShape &output_shape, Scalar *output_data)
+{
+  ruy::profiler::ScopeLabel label("Reverse");
+
+  // Dimensions before `axis` form independent outer blocks ...
+  int outer_size = 1;
+  for (int d = 0; d < axis; ++d)
+    outer_size *= input_shape.Dims(d);
+  // ... and the dimensions after it form the run that is copied as one unit.
+  int copy_size = 1;
+  for (int d = axis + 1; d < input_shape.DimensionsCount(); ++d)
+    copy_size *= input_shape.Dims(d);
+
+  const int dims_at_axis = input_shape.Dims(axis);
+  for (int outer = 0; outer < outer_size; ++outer)
+  {
+    const int block_base = outer * dims_at_axis;
+    for (int j = 0; j < dims_at_axis; ++j)
+    {
+      // Output entry j receives the input entry at the mirrored position.
+      const Scalar *src = input_data + (block_base + dims_at_axis - j - 1) * copy_size;
+      Scalar *dst = output_data + (block_base + j) * copy_size;
+      memcpy(dst, src, copy_size * sizeof(Scalar));
+    }
+  }
+}
+
+// Reverses a per-batch prefix of the input along `seq_dim`.
+//
+// For index b along `batch_dim`, the first seq_lengths[b] positions along
+// `seq_dim` are written in reversed order; positions past that prefix are
+// copied to the same location. If `batch_dim == seq_dim` neither branch below
+// runs and `output_data` is left untouched. `output_shape` is not consulted.
+//
+// The tensor is viewed as
+//   [outer_size, Dims(outer_dim), medium_size, Dims(medium_dim), copy_size]
+// where outer_dim = min(batch_dim, seq_dim), medium_dim = max(batch_dim, seq_dim).
+template <typename Scalar, typename TS>
+void ReverseSequence(const TS *seq_lengths, const int seq_dim, const int batch_dim,
+                     const RuntimeShape &input_shape, const Scalar *input_data,
+                     const RuntimeShape &output_shape, Scalar *output_data)
+{
+  ruy::profiler::ScopeLabel label("ReverseSequence");
+
+  // Product of the dimensions that precede both batch_dim and seq_dim.
+  int outer_size = 1;
+  int outer_dim = std::min(batch_dim, seq_dim);
+  int medium_dim = std::max(batch_dim, seq_dim);
+  for (int i = 0; i < outer_dim; ++i)
+  {
+    outer_size *= input_shape.Dims(i);
+  }
+
+  // Product of the dimensions strictly between outer_dim and medium_dim.
+  int medium_size = 1;
+  for (int i = outer_dim + 1; i < medium_dim; ++i)
+  {
+    medium_size *= input_shape.Dims(i);
+  }
+
+  // Product of the dimensions after medium_dim: elements moved per memcpy.
+  int copy_size = 1;
+  for (int i = medium_dim + 1; i < input_shape.DimensionsCount(); ++i)
+  {
+    copy_size *= input_shape.Dims(i);
+  }
+
+  const int dims_at_outer_dim = input_shape.Dims(outer_dim);
+  const int dims_at_medium_dim = input_shape.Dims(medium_dim);
+
+  Scalar *output_ptr;
+  if (batch_dim > seq_dim)
+  {
+    // Here outer_dim == seq_dim (index j) and medium_dim == batch_dim (index q).
+    for (int i = 0; i < outer_size; ++i)
+    {
+      for (int j = 0; j < dims_at_outer_dim; ++j)
+      {
+        const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+        for (int p = 0; p < medium_size; ++p)
+        {
+          for (int q = 0; q < dims_at_medium_dim; ++q)
+          {
+            const int in_pos = ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+            const Scalar *in_ptr = input_data + in_pos;
+            // Last sequence position that takes part in the reversal for batch q.
+            int sl = seq_lengths[q] - 1;
+            if (j > sl)
+            {
+              // Outside the reversed prefix: keep the element where it is.
+              output_ptr = output_data + in_pos;
+            }
+            else
+            {
+              // Inside the prefix: sequence position j maps to sl - j.
+              const int out_pos_base = (i * dims_at_outer_dim + sl - j) * medium_size;
+              const int out_pos = ((out_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+              output_ptr = output_data + out_pos;
+            }
+            memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
+          }
+        }
+      }
+    }
+  }
+  else if (batch_dim < seq_dim)
+  {
+    // Here outer_dim == batch_dim (index j) and medium_dim == seq_dim (index q).
+    for (int i = 0; i < outer_size; ++i)
+    {
+      for (int j = 0; j < dims_at_outer_dim; ++j)
+      {
+        const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+        // Last sequence position that takes part in the reversal for batch j.
+        int sl = seq_lengths[j] - 1;
+        const int out_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+        for (int p = 0; p < medium_size; ++p)
+        {
+          for (int q = 0; q < dims_at_medium_dim; ++q)
+          {
+            const int in_pos = ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+            const Scalar *in_ptr = input_data + in_pos;
+            if (q > sl)
+            {
+              // Outside the reversed prefix: keep the element where it is.
+              output_ptr = output_data + in_pos;
+            }
+            else
+            {
+              // Inside the prefix: sequence position q maps to sl - q.
+              const int out_pos = ((out_pos_base + p) * dims_at_medium_dim + sl - q) * copy_size;
+              output_ptr = output_data + out_pos;
+            }
+            memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
+          }
+        }
+      }
+    }
+  }
+}
+
+// Accumulates the rows of the input (slices along dimension 0) into output
+// rows selected by `segment_ids_data`.
+//
+// The output buffer is zero-filled first; input row r is then added
+// element-wise to output row segment_ids_data[r]. The row length is the flat
+// size of every dimension except dimension 0. `segment_ids_shape` is not
+// consulted, and segment ids are not range-checked here.
+template <typename T>
+inline void SegmentSum(const RuntimeShape &input_shape, const T *input_data,
+                       const RuntimeShape &segment_ids_shape, const int32_t *segment_ids_data,
+                       const RuntimeShape &output_shape, T *output_data)
+{
+  const int row_len = MatchingFlatSizeSkipDim(input_shape, 0, output_shape);
+
+  memset(output_data, 0, sizeof(T) * output_shape.FlatSize());
+
+  for (int row = 0; row < input_shape.Dims(0); ++row)
+  {
+    const int segment = segment_ids_data[row];
+    const int src_base = row * row_len;
+    const int dst_base = segment * row_len;
+    for (int col = 0; col < row_len; ++col)
+    {
+      output_data[dst_base + col] += input_data[src_base + col];
+    }
+  }
+}
+
+} // namespace reference_ops
+} // namespace tflite
+
+#endif // LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
diff --git a/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst b/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst
index 428b15ee0..1e6c41ecc 100644
--- a/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst
+++ b/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst
@@ -13,6 +13,7 @@ REGISTER_KERNEL(Div)
 REGISTER_KERNEL(Elu)
 REGISTER_KERNEL(Exp)
 REGISTER_KERNEL(ExpandDims)
+REGISTER_KERNEL(Fill)
 REGISTER_KERNEL(Floor)
 REGISTER_KERNEL(FloorDiv)
 REGISTER_KERNEL(Equal)
@@ -48,6 +49,7 @@ REGISTER_KERNEL(PadV2)
 REGISTER_KERNEL(Pow)
 REGISTER_KERNEL(PRelu)
 REGISTER_KERNEL(Quantize)
+REGISTER_KERNEL(ReduceMax)
 REGISTER_KERNEL(Relu)
 REGISTER_KERNEL(Relu6)
 REGISTER_KERNEL(Reshape)
@@ -55,6 +57,7 @@ REGISTER_KERNEL(ResizeBilinear)
 REGISTER_KERNEL(ResizeNearestNeighbor)
 REGISTER_KERNEL(ReverseV2)
 REGISTER_KERNEL(Rsqrt)
+REGISTER_KERNEL(Shape)
 REGISTER_KERNEL(Slice)
 REGISTER_KERNEL(Softmax)
 REGISTER_KERNEL(SpaceToBatchND)
diff --git a/compiler/luci-interpreter/pal/linux/PALreference_ops.h b/compiler/luci-interpreter/pal/linux/PALreference_ops.h
new file mode 100644
index 000000000..825ebfe8e
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALreference_ops.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
+#define LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
+
+#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+
+#endif // LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
diff --git a/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst b/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst
index d134a6b95..f0df58db3 100644
--- a/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst
+++ b/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst
@@ -12,6 +12,7 @@ REGISTER_KERNEL(Div)
 REGISTER_KERNEL(Elu)
 REGISTER_KERNEL(Exp)
 REGISTER_KERNEL(ExpandDims)
+REGISTER_KERNEL(Fill)
 REGISTER_KERNEL(Floor)
 REGISTER_KERNEL(FloorDiv)
 REGISTER_KERNEL(Equal)
@@ -44,6 +45,7 @@ REGISTER_KERNEL(Reshape)
 REGISTER_KERNEL(ResizeBilinear)
 REGISTER_KERNEL(ResizeNearestNeighbor)
 REGISTER_KERNEL(Rsqrt)
+REGISTER_KERNEL(Shape)
 REGISTER_KERNEL(Softmax)
 REGISTER_KERNEL(SpaceToBatchND)
 REGISTER_KERNEL(SpaceToDepth)
diff --git a/compiler/luci-interpreter/pal/mcu/PALDequantize.h b/compiler/luci-interpreter/pal/mcu/PALDequantize.h
index 15ff0327b..efa6b167e 100644
--- a/compiler/luci-interpreter/pal/mcu/PALDequantize.h
+++ b/compiler/luci-interpreter/pal/mcu/PALDequantize.h
@@ -18,7 +18,7 @@
 #define LUCI_INTERPRETER_PAL_DEQUANTIZE_H
 
 #include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h"
-#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "PALreference_ops.h"
 
 namespace luci_interpreter_pal
 {
diff --git a/compiler/luci-interpreter/pal/mcu/PALQuantize.h b/compiler/luci-interpreter/pal/mcu/PALQuantize.h
index 6046789ae..effb85d54 100644
--- a/compiler/luci-interpreter/pal/mcu/PALQuantize.h
+++ b/compiler/luci-interpreter/pal/mcu/PALQuantize.h
@@ -17,7 +17,7 @@
 #ifndef LUCI_INTERPRETER_PAL_QUANTIZE_H
 #define LUCI_INTERPRETER_PAL_QUANTIZE_H
 
-#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "PALreference_ops.h"
 
 namespace luci_interpreter_pal
 {
diff --git a/compiler/luci-interpreter/pal/mcu/PALreference_ops.h b/compiler/luci-interpreter/pal/mcu/PALreference_ops.h
new file mode 100644
index 000000000..62c720937
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALreference_ops.h
@@ -0,0 +1,1556 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
+#define LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <type_traits>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "fixedpoint/fixedpoint.h"
+#include "ruy/profiler/instrumentation.h" // from @ruy
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/add.h"
+#include "tensorflow/lite/kernels/internal/reference/add_n.h"
+#include "tensorflow/lite/kernels/internal/reference/arg_min_max.h"
+#include "tensorflow/lite/kernels/internal/reference/batch_matmul.h"
+#include "tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h"
+#include "tensorflow/lite/kernels/internal/reference/binary_function.h"
+#include "tensorflow/lite/kernels/internal/reference/cast.h"
+#include "tensorflow/lite/kernels/internal/reference/ceil.h"
+#include "tensorflow/lite/kernels/internal/reference/comparisons.h"
+#include "tensorflow/lite/kernels/internal/reference/concatenation.h"
+#include "tensorflow/lite/kernels/internal/reference/conv.h"
+#include "tensorflow/lite/kernels/internal/reference/depth_to_space.h"
+#include "tensorflow/lite/kernels/internal/reference/dequantize.h"
+#include "tensorflow/lite/kernels/internal/reference/div.h"
+#include "tensorflow/lite/kernels/internal/reference/elu.h"
+#include "tensorflow/lite/kernels/internal/reference/exp.h"
+#include "tensorflow/lite/kernels/internal/reference/fill.h"
+#include "tensorflow/lite/kernels/internal/reference/floor.h"
+#include "tensorflow/lite/kernels/internal/reference/floor_div.h"
+#include "tensorflow/lite/kernels/internal/reference/floor_mod.h"
+#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
+#include "tensorflow/lite/kernels/internal/reference/gather.h"
+#include "tensorflow/lite/kernels/internal/reference/hard_swish.h"
+#include "tensorflow/lite/kernels/internal/reference/l2normalization.h"
+#include "tensorflow/lite/kernels/internal/reference/leaky_relu.h"
+#include "tensorflow/lite/kernels/internal/reference/log_softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/logistic.h"
+#include "tensorflow/lite/kernels/internal/reference/maximum_minimum.h"
+#include "tensorflow/lite/kernels/internal/reference/mul.h"
+#include "tensorflow/lite/kernels/internal/reference/neg.h"
+#include "tensorflow/lite/kernels/internal/reference/pad.h"
+#include "tensorflow/lite/kernels/internal/reference/pooling.h"
+#include "tensorflow/lite/kernels/internal/reference/prelu.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
+#include "tensorflow/lite/kernels/internal/reference/quantize.h"
+#include "tensorflow/lite/kernels/internal/reference/reduce.h"
+#include "tensorflow/lite/kernels/internal/reference/requantize.h"
+#include "tensorflow/lite/kernels/internal/reference/resize_bilinear.h"
+#include "tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h"
+#include "tensorflow/lite/kernels/internal/reference/round.h"
+#include "tensorflow/lite/kernels/internal/reference/softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h"
+#include "tensorflow/lite/kernels/internal/reference/space_to_depth.h"
+#include "tensorflow/lite/kernels/internal/reference/strided_slice.h"
+#include "tensorflow/lite/kernels/internal/reference/string_comparisons.h"
+#include "tensorflow/lite/kernels/internal/reference/sub.h"
+#include "tensorflow/lite/kernels/internal/reference/tanh.h"
+#include "tensorflow/lite/kernels/internal/reference/transpose.h"
+#include "tensorflow/lite/kernels/internal/reference/transpose_conv.h"
+#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+namespace tflite
+{
+
+namespace reference_ops
+{
+
+// Element-wise ReLU: values below zero are replaced by zero, all other
+// values are passed through unchanged.
+template <typename T>
+inline void Relu(const RuntimeShape &input_shape, const T *input_data,
+                 const RuntimeShape &output_shape, T *output_data)
+{
+  const int count = MatchingFlatSize(input_shape, output_shape);
+  const T zero = 0;
+  for (int idx = 0; idx < count; ++idx)
+  {
+    T result = input_data[idx];
+    if (result < zero)
+    {
+      result = zero;
+    }
+    output_data[idx] = result;
+  }
+}
+
+// Element-wise clamp of the input to the closed interval [-1, 1].
+template <typename T>
+inline void Relu1(const RuntimeShape &input_shape, const T *input_data,
+                  const RuntimeShape &output_shape, T *output_data)
+{
+  ruy::profiler::ScopeLabel label("Relu1 (not fused)");
+  const int count = MatchingFlatSize(input_shape, output_shape);
+  const T upper = 1;
+  const T lower = -1;
+  for (int idx = 0; idx < count; ++idx)
+  {
+    T result = input_data[idx];
+    // The upper bound is tested first, as in a chained conditional.
+    if (result > upper)
+    {
+      result = upper;
+    }
+    else if (result < lower)
+    {
+      result = lower;
+    }
+    output_data[idx] = result;
+  }
+}
+
+// Element-wise clamp of a float tensor to the closed interval [0, 6].
+inline void Relu6(const RuntimeShape &input_shape, const float *input_data,
+                  const RuntimeShape &output_shape, float *output_data)
+{
+  ruy::profiler::ScopeLabel label("Relu6 (not fused)");
+  const int count = MatchingFlatSize(input_shape, output_shape);
+  const float upper = 6;
+  const float lower = 0;
+  for (int idx = 0; idx < count; ++idx)
+  {
+    float result = input_data[idx];
+    // The upper bound is tested first, as in a chained conditional.
+    if (result > upper)
+    {
+      result = upper;
+    }
+    else if (result < lower)
+    {
+      result = lower;
+    }
+    output_data[idx] = result;
+  }
+}
+
+// Quantized ReluX with requantization.
+//
+// Each input value has `params.input_offset` subtracted, is rescaled with
+// MultiplyByQuantizedMultiplier(params.output_multiplier, params.output_shift),
+// has `params.output_offset` added, and is then clamped to
+// [params.quantized_activation_min, params.quantized_activation_max] before
+// being cast back to T.
+template <typename T>
+inline void ReluX(const tflite::ReluParams &params, const RuntimeShape &input_shape,
+                  const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{
+  ruy::profiler::ScopeLabel label("Quantized ReluX (not fused)");
+  const int count = MatchingFlatSize(input_shape, output_shape);
+  for (int idx = 0; idx < count; ++idx)
+  {
+    const int32 raw = static_cast<int32_t>(input_data[idx]);
+    const int32 rescaled = MultiplyByQuantizedMultiplier(
+      raw - params.input_offset, params.output_multiplier, params.output_shift);
+    const int32 shifted = params.output_offset + rescaled;
+    // Apply the lower bound first, then the upper bound.
+    const int32 bounded =
+      std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, shifted));
+    output_data[idx] = static_cast<T>(bounded);
+  }
+}
+
+// Quantized ReluX without requantization: clamps every element to
+// [params.quantized_activation_min, params.quantized_activation_max], with
+// both bounds converted to T once up front.
+template <typename T>
+inline void ReluX(const tflite::ActivationParams &params, const RuntimeShape &input_shape,
+                  const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{
+  ruy::profiler::ScopeLabel label("Quantized ReluX (not fused)");
+  const int count = MatchingFlatSize(input_shape, output_shape);
+  const T upper = params.quantized_activation_max;
+  const T lower = params.quantized_activation_min;
+  for (int idx = 0; idx < count; ++idx)
+  {
+    T result = input_data[idx];
+    // The upper bound is tested first, as in a chained conditional.
+    if (result > upper)
+    {
+      result = upper;
+    }
+    else if (result < lower)
+    {
+      result = lower;
+    }
+    output_data[idx] = result;
+  }
+}
+
+// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// Quantized uint8 multiplication over a five-level broadcast pattern taken
+// from `params.broadcast_shape[0..4]`.
+//
+// When `broadcast_category` is kFirstInputBroadcastsFast the inputs are used
+// as given; for any other category the two inputs (and their offsets) are
+// swapped before running the loops. The innermost work is delegated to
+// MulElementwise on runs of broadcast_shape[4] elements. `output_shape` and
+// the two input shapes are not consulted.
+inline void BroadcastMulFivefold(const ArithmeticParams &unswitched_params,
+                                 const RuntimeShape &unswitched_input1_shape,
+                                 const uint8 *unswitched_input1_data,
+                                 const RuntimeShape &unswitched_input2_shape,
+                                 const uint8 *unswitched_input2_data,
+                                 const RuntimeShape &output_shape, uint8 *output_data)
+{
+  ArithmeticParams switched_params = unswitched_params;
+  switched_params.input1_offset = unswitched_params.input2_offset;
+  switched_params.input2_offset = unswitched_params.input1_offset;
+
+  const bool use_unswitched = unswitched_params.broadcast_category ==
+                              tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+  const ArithmeticParams &params = use_unswitched ? unswitched_params : switched_params;
+  const uint8 *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+  const uint8 *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+  // Fivefold nested loops. The second input resets its position for each
+  // iteration of the second loop. The first input resets its position at the
+  // beginning of the fourth loop. The innermost loop is an elementwise Mul of
+  // sections of the arrays.
+  uint8 *output_data_ptr = output_data;
+  const uint8 *input1_data_ptr = input1_data;
+  const uint8 *input2_data_reset = input2_data;
+  int y0 = params.broadcast_shape[0];
+  int y1 = params.broadcast_shape[1];
+  int y2 = params.broadcast_shape[2];
+  int y3 = params.broadcast_shape[3];
+  int y4 = params.broadcast_shape[4];
+  for (int i0 = 0; i0 < y0; ++i0)
+  {
+    // Start from the current reset position so the pointer is never read
+    // while indeterminate: if y1 == 0 the loop below does not assign it, and
+    // the reset position is then simply carried over unchanged.
+    const uint8 *input2_data_ptr = input2_data_reset;
+    for (int i1 = 0; i1 < y1; ++i1)
+    {
+      input2_data_ptr = input2_data_reset;
+      for (int i2 = 0; i2 < y2; ++i2)
+      {
+        for (int i3 = 0; i3 < y3; ++i3)
+        {
+          MulElementwise(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr);
+          input2_data_ptr += y4;
+          output_data_ptr += y4;
+        }
+        input1_data_ptr += y4;
+      }
+    }
+    // The next i0 iteration continues where the second input left off.
+    input2_data_reset = input2_data_ptr;
+  }
+}
+
+// Element-wise int16 multiplication where both inputs and the output are
+// interpreted as gemmlowp fixed-point values with 0 integer bits. The raw
+// fixed-point product is stored without any additional clamping; `params` is
+// not consulted.
+inline void Mul(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+                const int16 *input1_data, const RuntimeShape &input2_shape,
+                const int16 *input2_data, const RuntimeShape &output_shape, int16 *output_data)
+{
+  ruy::profiler::ScopeLabel label("Mul/Int16");
+
+  // F0 uses 0 integer bits, range [-1, 1].
+  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+  const int count = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  for (int idx = 0; idx < count; ++idx)
+  {
+    const F0 lhs = F0::FromRaw(input1_data[idx]);
+    const F0 rhs = F0::FromRaw(input2_data[idx]);
+    F0 product = lhs * rhs;
+    output_data[idx] = product.raw();
+  }
+}
+
+// Element-wise multiplication of two int16 fixed-point tensors (0 integer
+// bits) producing a uint8 output.
+//
+// The fixed-point product is divided by 2^8 with rounding, clamped so that
+// the value plus `params.output_offset` stays within
+// [quantized_activation_min, quantized_activation_max], and the offset is
+// then added.
+inline void Mul(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+                const int16 *input1_data, const RuntimeShape &input2_shape,
+                const int16 *input2_data, const RuntimeShape &output_shape, uint8 *output_data)
+{
+  ruy::profiler::ScopeLabel label("Mul/Int16Uint8");
+  const int32 offset = params.output_offset;
+  const int32 act_min = params.quantized_activation_min;
+  const int32 act_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(act_min, act_max);
+
+  const int count = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  // F0 uses 0 integer bits, range [-1, 1].
+  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+  for (int idx = 0; idx < count; ++idx)
+  {
+    F0 product = F0::FromRaw(input1_data[idx]) * F0::FromRaw(input2_data[idx]);
+    const int16 rescaled = gemmlowp::RoundingDivideByPOT(product.raw(), 8);
+    // Upper bound first, then lower bound, both relative to the offset.
+    const int16 capped = std::min<int16>(act_max - offset, rescaled);
+    const int16 bounded = std::max<int16>(act_min - offset, capped);
+    output_data[idx] = offset + bounded;
+  }
+}
+
+// Element-wise int16 subtraction (input1 - input2) on gemmlowp fixed-point
+// values with 0 integer bits.
+//
+// At most one of the inputs carries a non-zero (negative) shift; that input
+// is divided by the matching power of two with rounding before the
+// saturating subtraction. The result is clamped to
+// [quantized_activation_min, quantized_activation_max].
+inline void Sub16(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+                  const int16_t *input1_data, const RuntimeShape &input2_shape,
+                  const int16_t *input2_data, const RuntimeShape &output_shape,
+                  int16_t *output_data)
+{
+  ruy::profiler::ScopeLabel label("Sub/Int16");
+  const int input1_shift = params.input1_shift;
+  const int count = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  const int16 act_min = params.quantized_activation_min;
+  const int16 act_max = params.quantized_activation_max;
+
+  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
+  TFLITE_DCHECK_LE(input1_shift, 0);
+  TFLITE_DCHECK_LE(params.input2_shift, 0);
+
+  // True when input1 is used as-is and input2 is the operand that gets shifted.
+  const bool second_is_shifted = (input1_shift == 0);
+  const int16 *unshifted = second_is_shifted ? input1_data : input2_data;
+  const int16 *shifted = second_is_shifted ? input2_data : input1_data;
+  const int right_shift = second_is_shifted ? -params.input2_shift : -input1_shift;
+
+  // F0 uses 0 integer bits, range [-1, 1].
+  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+  for (int idx = 0; idx < count; ++idx)
+  {
+    F0 full_scale = F0::FromRaw(unshifted[idx]);
+    F0 downscaled = F0::FromRaw(gemmlowp::RoundingDivideByPOT(shifted[idx], right_shift));
+    // Keep the operand order input1 - input2 regardless of which one was shifted.
+    F0 difference = second_is_shifted ? SaturatingSub(full_scale, downscaled)
+                                      : SaturatingSub(downscaled, full_scale);
+    const int16 raw = difference.raw();
+    output_data[idx] = std::min(act_max, std::max(act_min, raw));
+  }
+}
+
+// Stacks `params.inputs_count` equally-shaped inputs along `params.axis` of
+// the output.
+//
+// The output is viewed as [leading, inputs_count, slice], with `leading` the
+// product of the output dimensions before the axis and `slice` the product of
+// those after it. Slice o of input n is copied to output position (o, n).
+template <typename Scalar>
+void Pack(const PackParams &params, const RuntimeShape *const *input_shapes,
+          const Scalar *const *input_data, const RuntimeShape &output_shape, Scalar *output_data)
+{
+  ruy::profiler::ScopeLabel label("Pack");
+  const int rank = output_shape.DimensionsCount();
+  const int axis = params.axis;
+  const int num_inputs = params.inputs_count;
+
+  int leading_count = 1;
+  for (int d = 0; d < axis; ++d)
+  {
+    leading_count *= output_shape.Dims(d);
+  }
+  int slice_elems = 1;
+  for (int d = axis + 1; d < rank; ++d)
+  {
+    slice_elems *= output_shape.Dims(d);
+  }
+  TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), slice_elems * leading_count);
+
+  const auto slice_bytes = slice_elems * sizeof(Scalar);
+  for (int n = 0; n < num_inputs; ++n)
+  {
+    for (int o = 0; o < leading_count; ++o)
+    {
+      const Scalar *src = input_data[n] + slice_elems * o;
+      Scalar *dst = output_data + (o * num_inputs + n) * slice_elems;
+      memcpy(dst, src, slice_bytes);
+    }
+  }
+}
+
+// Splits the input along `params.axis` into `params.num_split` outputs.
+//
+// A negative axis is counted from the end. The input is viewed as
+// [leading, num_split, slice]; slice (o, n) of the input is copied to
+// position o of output n.
+template <typename Scalar>
+void Unpack(const UnpackParams &params, const RuntimeShape &input_shape, const Scalar *input_data,
+            const RuntimeShape &output_shape, Scalar *const *output_datas)
+{
+  ruy::profiler::ScopeLabel label("Unpack");
+  const int rank = input_shape.DimensionsCount();
+  const int num_outputs = params.num_split;
+
+  int axis = params.axis;
+  if (axis < 0)
+  {
+    axis += rank;
+  }
+  TFLITE_DCHECK_GE(axis, 0);
+  TFLITE_DCHECK_LT(axis, rank);
+
+  int leading_count = 1;
+  for (int d = 0; d < axis; ++d)
+  {
+    leading_count *= input_shape.Dims(d);
+  }
+  int slice_elems = 1;
+  for (int d = axis + 1; d < rank; ++d)
+  {
+    slice_elems *= input_shape.Dims(d);
+  }
+  TFLITE_DCHECK_EQ(output_shape.FlatSize(), slice_elems * leading_count);
+
+  const auto slice_bytes = slice_elems * sizeof(Scalar);
+  for (int n = 0; n < num_outputs; ++n)
+  {
+    for (int o = 0; o < leading_count; ++o)
+    {
+      Scalar *dst = output_datas[n] + slice_elems * o;
+      const Scalar *src = input_data + (o * num_outputs + n) * slice_elems;
+      memcpy(dst, src, slice_bytes);
+    }
+  }
+}
+
+// Stacks uint8 inputs along `params.axis` of the output, requantizing any
+// input whose zero point or scale differs from the output's.
+//
+// The output is viewed as [outer_size, inputs_count, copy_size]. For each
+// (k, i) the k-th slice of input i is either copied verbatim (matching
+// quantization) or rescaled element by element and clamped to [0, 255].
+//
+// NOTE: the rescaling branch still trips assert(false) in debug builds, as it
+// did before. When assertions are compiled out it now reads the k-th slice of
+// the input, matching the memcpy branch, instead of always reading the first.
+template <typename Scalar>
+void PackWithScaling(const PackParams &params, const RuntimeShape *const *input_shapes,
+                     const uint8 *const *input_data, const RuntimeShape &output_shape,
+                     uint8 *output_data)
+{
+  ruy::profiler::ScopeLabel label("PackWithScaling");
+  const int dimensions = output_shape.DimensionsCount();
+  int axis = params.axis;
+  const int32 *input_zeropoint = params.input_zeropoint;
+  const float *input_scale = params.input_scale;
+  int inputs_count = params.inputs_count;
+  const int32 output_zeropoint = params.output_zeropoint;
+  const float output_scale = params.output_scale;
+
+  // Product of the output dimensions before the pack axis.
+  int outer_size = 1;
+  for (int i = 0; i < axis; i++)
+  {
+    outer_size *= output_shape.Dims(i);
+  }
+  // Product of the output dimensions after the pack axis.
+  int copy_size = 1;
+  for (int i = axis + 1; i < dimensions; i++)
+  {
+    copy_size *= output_shape.Dims(i);
+  }
+  TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
+
+  Scalar *output_ptr = output_data;
+  const float inverse_output_scale = 1.f / output_scale;
+  for (int k = 0; k < outer_size; k++)
+  {
+    for (int i = 0; i < inputs_count; ++i)
+    {
+      if (input_zeropoint[i] == output_zeropoint && input_scale[i] == output_scale)
+      {
+        memcpy(output_ptr, input_data[i] + k * copy_size, copy_size * sizeof(Scalar));
+      }
+      else
+      {
+        assert(false);
+        const float scale = input_scale[i] * inverse_output_scale;
+        const float bias = -input_zeropoint[i] * scale;
+        // Read the same k-th slice that the memcpy branch above reads.
+        auto input_ptr = input_data[i] + k * copy_size;
+        for (int j = 0; j < copy_size; ++j)
+        {
+          const int value =
+            static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
+          output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0));
+        }
+      }
+      output_ptr += copy_size;
+    }
+  }
+}
+
+// Concatenates the inputs along axis 3 by forwarding to Concatenation with a
+// copy of `params` whose `axis` field is overridden to 3.
+template <typename Scalar>
+void DepthConcatenation(const ConcatenationParams &params, const RuntimeShape *const *input_shapes,
+                        const Scalar *const *input_data, const RuntimeShape &output_shape,
+                        Scalar *output_data)
+{
+  ruy::profiler::ScopeLabel label("DepthConcatenation");
+  ConcatenationParams depth_params = params;
+  depth_params.axis = 3;
+  Concatenation(depth_params, input_shapes, input_data, output_shape, output_data);
+}
+
+// Float LSTM cell.
+//
+// Steps performed:
+//  1. `input_data` and `prev_activ_data` are concatenated along axis 3 into
+//     `concat_temp_data`.
+//  2. A fully connected layer (weights/bias, no activation clamping) writes
+//     the gate pre-activations into `activ_temp_data`.
+//  3. Per element, the four depth-wise quarters of `activ_temp_data` are used
+//     as input gate, new input, forget gate and output gate to produce
+//     `output_state_data` and `output_activ_data`.
+//
+// All shapes except `weights_shape` are extended to 4 dimensions; `params` is
+// not read anywhere in this body.
+inline void LstmCell(const LstmCellParams &params, const RuntimeShape &unextended_input_shape,
+                     const float *input_data, const RuntimeShape &unextended_prev_activ_shape,
+                     const float *prev_activ_data, const RuntimeShape &weights_shape,
+                     const float *weights_data, const RuntimeShape &unextended_bias_shape,
+                     const float *bias_data, const RuntimeShape &unextended_prev_state_shape,
+                     const float *prev_state_data,
+                     const RuntimeShape &unextended_output_state_shape, float *output_state_data,
+                     const RuntimeShape &unextended_output_activ_shape, float *output_activ_data,
+                     const RuntimeShape &unextended_concat_temp_shape, float *concat_temp_data,
+                     const RuntimeShape &unextended_activ_temp_shape, float *activ_temp_data)
+{
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
+  const RuntimeShape input_shape = RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape prev_activ_shape = RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
+  const RuntimeShape bias_shape = RuntimeShape::ExtendedShape(4, unextended_bias_shape);
+  const RuntimeShape prev_state_shape = RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
+  const RuntimeShape output_state_shape =
+    RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
+  const RuntimeShape output_activ_shape =
+    RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
+  const RuntimeShape concat_temp_shape =
+    RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
+  const RuntimeShape activ_temp_shape = RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
+  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+
+  // Dimensions 0..2 must agree across input, previous and output tensors.
+  const int weights_dim_count = weights_shape.DimensionsCount();
+  const int batches = MatchingDim(input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0,
+                                  output_state_shape, 0, output_activ_shape, 0);
+  const int height = MatchingDim(input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1,
+                                 output_state_shape, 1, output_activ_shape, 1);
+  const int width = MatchingDim(input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2,
+                                output_state_shape, 2, output_activ_shape, 2);
+  const int input_depth = input_shape.Dims(3);
+  const int prev_activ_depth = prev_activ_shape.Dims(3);
+  const int total_input_depth = prev_activ_depth + input_depth;
+  TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), total_input_depth);
+  TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
+  const int intern_activ_depth = MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
+  TFLITE_DCHECK_EQ(weights_shape.FlatSize(), intern_activ_depth * total_input_depth);
+  // The fully connected output holds four equally sized gate slices.
+  TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
+  const int output_depth = MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
+                                       3, output_activ_shape, 3);
+  TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
+
+  // Concatenate prev_activ and input data together
+  std::vector<float const *> concat_input_arrays_data;
+  std::vector<RuntimeShape const *> concat_input_arrays_shapes;
+  concat_input_arrays_data.push_back(input_data);
+  concat_input_arrays_data.push_back(prev_activ_data);
+  concat_input_arrays_shapes.push_back(&input_shape);
+  concat_input_arrays_shapes.push_back(&prev_activ_shape);
+  tflite::ConcatenationParams concat_params;
+  concat_params.axis = 3;
+  concat_params.inputs_count = concat_input_arrays_data.size();
+  Concatenation(concat_params, &(concat_input_arrays_shapes[0]), &(concat_input_arrays_data[0]),
+                concat_temp_shape, concat_temp_data);
+
+  // Fully connected
+  // The activation range is set to the full float range, i.e. no clamping.
+  tflite::FullyConnectedParams fc_params;
+  fc_params.float_activation_min = std::numeric_limits<float>::lowest();
+  fc_params.float_activation_max = std::numeric_limits<float>::max();
+  FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape, weights_data,
+                 bias_shape, bias_data, activ_temp_shape, activ_temp_data);
+
+  // Memory state update (the LSTM "guts")
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int w = 0; w < width; ++w)
+    {
+      for (int h = 0; h < height; ++h)
+      {
+        for (int c = 0; c < output_depth; ++c)
+        {
+          // Slice 0 of the depth axis: logistic sigmoid -> input gate.
+          const float input_gate =
+            1.f /
+            (1.f +
+             std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 0 * output_depth + c)]));
+          // Slice 1: tanh -> candidate input.
+          const float new_input =
+            std::tanh(activ_temp_data[Offset(activ_temp_shape, b, h, w, 1 * output_depth + c)]);
+          // Slice 2: logistic sigmoid -> forget gate.
+          const float forget_gate =
+            1.f /
+            (1.f +
+             std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 2 * output_depth + c)]));
+          // Slice 3: logistic sigmoid -> output gate.
+          const float output_gate =
+            1.f /
+            (1.f +
+             std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 3 * output_depth + c)]));
+          // New cell state: gated candidate plus gated previous state.
+          const float new_state =
+            input_gate * new_input +
+            forget_gate * prev_state_data[Offset(prev_state_shape, b, h, w, c)];
+          output_state_data[Offset(output_state_shape, b, h, w, c)] = new_state;
+          output_activ_data[Offset(output_activ_shape, b, h, w, c)] =
+            output_gate * std::tanh(new_state);
+        }
+      }
+    }
+  }
+}
+
+// Quantized LSTM cell implementation.
+// The quantization of the input, output arrays is as follows:
+//  - The input activations are quantized as uint8 on the interval
+//    [-1, 127/128].
+//    The rationale for that is that it is the natural interval for output
+//    activations (see next point) and these need to be concatenated together.
+//    We could accommodate different ranges by re-scaling, but we empirically
+//    found that setting the input activations range to be [-1, 127/128] in the
+//    first place, removing the need for re-scaling, greatly improves accuracy.
+//  - The output activations are quantized as uint8 on the interval
+//    [-1, 127/128].
+//    The rationale for that is that the definition of a LSTM cell makes them
+//    intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128]
+//    makes for simpler, more accurate fixed-point arithmetic.
+//  - The output-at-previous-timestep state array is obviously quantized as
+//    the output activations.
+//  - The internal LSTM memory (not the output-at-previous-timestep, the other
+//    internal state array) is int16-quantized and may use any power-of-two,
+//    symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call
+//    StateIntegerBits below, see the below discussion of that template
+//    parameter ("The StateIntegerBits template parameter").
+//  - The output of the internal fully-connected node is int16-quantized
+//    on the interval [-8, 8 * 32767/32768], the rationale for which is
+//    explained just below ("Why [-8, 8] for fully-connected output?").
+//
+//
+// === The StateIntegerBits template parameter ===
+//
+// The StateIntegerBits template parameter controls the fixed-point format used
+// to represent the internal memory of the LSTM cell (not the
+// output-at-previous-timestep, the other internal state array). It's currently
+// a template parameter so that the model can control that. The most typical
+// value for StateIntegerBits is 4. Other plausible values are anywhere between
+// 3 and 5. We might eventually standardize on a single supported value, e.g. 4,
+// and drop that template parameter. The reason why it can't be a runtime
+// parameter is that this controls the fixed-point format used, i.e. we need to
+// generate actually different code based on it. In particular, we generate code
+// for a fixed-point tanh() implementation for that format, which internally
+// uses a fixed-point exp() implementation, which internally uses a
+// barrel-shifter with a number of steps that depends on StateIntegerBits.
+// Another consequence of that is that a higher value of StateIntegerBits
+// results in a more expensive implementation (more barrel shifter steps
+// needed).
+//
+//
+// === Why [-8, 8] for fully-connected output? ===
+//
+// This array is only fed to Logistic and Tanh functions, for which
+// the quantized implementation will want to use fixed-point arithmetic,
+// requiring a power-of-two representation interval. Thus, we should right
+// away quantize this array to a power-of-two interval; otherwise,
+// implementation will need to rescale that, losing any benefit that a tighter
+// representation interval might otherwise yield, while introducing some
+// numerical error and computational overhead.
+//
+// Now, Logistic and Tanh
+// are nearly constant (nearly equal to their horizontal asymptotes)
+// outside of a small bounded interval around 0:
+//
+//   Logistic(4) = 1 - 1.8e-2     Tanh(4) = 1 - 6.7e-4
+//   Logistic(8) = 1 - 3.4e-4     Tanh(8) = 1 - 2.3e-7
+//   Logistic(16) = 1 - 1.1e-7    Tanh(16) = 1 - 2.5e-14
+//
+// From this, we see that clamping to [-4, 4] would be too inaccurate
+// (the error of 1.8e-2 on Logistic would be felt even in 8bit precision)
+// while clamping to [-16, 16] would make no difference even in float32.
+// However, for a fixed-point implementation in 16-bit integers, using 5
+// integer bits to represent the [-16, 16] range would leave only 11
+// fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
+// representable values. Notice that is higher than the
+// worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
+// Using [-8, 8] thus seems like the better compromise overall, enjoying
+// an increment of 2.4e-4 between representable values and a worst-case
+// clamping error of 3.4e-4, both better than the increment of 4.9e-4 with
+// [-16, 16].
+//
+// Moreover, all other things being equal, it is nice to choose the narrower
+// representation range, as that makes the implementation of fixed-point
+// math functions a little cheaper (each integer bit requires an additional
+// barrel-shifter step in the implementation of exp(-x)). That is further
+// reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make
+// sense for 32-bit float or 32-bit fixed-point quantization, but we are
+// aiming for 16-bit fixed-point quantization of these internal nodes here.
+//
+// Quantized LSTM cell step (reference implementation).
+//
+// Data flow, in the order the body performs it:
+//  1. `input_data_uint8` and `prev_activ_data_uint8` are concatenated along
+//     dimension 3 into `concat_temp_data_uint8`.
+//  2. A fully-connected layer (uint8 weights, int32 bias) is applied to the
+//     concatenation; results are rescaled with `params.accum_multiplier` /
+//     `params.accum_shift`, saturated to int16 and written to
+//     `activ_temp_data_int16`.
+//  3. The four gate slices of that buffer are combined with
+//     `prev_state_data_int16` in 16-bit fixed point, producing
+//     `output_state_data_int16` and `output_activ_data_uint8`.
+//
+// All shapes except `weights_shape` may have up to 4 dimensions and are
+// extended to 4-D before use. `gemmlowp_context` is ignored here.
+template <int StateIntegerBits>
+inline void
+LstmCell(const LstmCellParams &params, const RuntimeShape &unextended_input_shape,
+         const uint8 *input_data_uint8, const RuntimeShape &unextended_prev_activ_shape,
+         const uint8 *prev_activ_data_uint8, const RuntimeShape &weights_shape,
+         const uint8 *weights_data_uint8, const RuntimeShape &unextended_bias_shape,
+         const int32 *bias_data_int32, const RuntimeShape &unextended_prev_state_shape,
+         const int16 *prev_state_data_int16, const RuntimeShape &unextended_output_state_shape,
+         int16 *output_state_data_int16, const RuntimeShape &unextended_output_activ_shape,
+         uint8 *output_activ_data_uint8, const RuntimeShape &unextended_concat_temp_shape,
+         uint8 *concat_temp_data_uint8, const RuntimeShape &unextended_activ_temp_shape,
+         int16 *activ_temp_data_int16, void *gemmlowp_context)
+{
+  (void)gemmlowp_context; // only used in optimized code.
+  int32 weights_zero_point = params.weights_zero_point;
+  int32 accum_multiplier = params.accum_multiplier;
+  int accum_shift = params.accum_shift;
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
+  const RuntimeShape input_shape = RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape prev_activ_shape = RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
+  const RuntimeShape bias_shape = RuntimeShape::ExtendedShape(4, unextended_bias_shape);
+  const RuntimeShape prev_state_shape = RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
+  const RuntimeShape output_state_shape =
+    RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
+  const RuntimeShape output_activ_shape =
+    RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
+  const RuntimeShape concat_temp_shape =
+    RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
+  const RuntimeShape activ_temp_shape = RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
+  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+
+  // Gather dimensions information, and perform consistency checks.
+  const int weights_dim_count = weights_shape.DimensionsCount();
+  const int outer_size = MatchingFlatSizeSkipDim(input_shape, 3, prev_activ_shape, prev_state_shape,
+                                                 output_state_shape, output_activ_shape);
+  const int input_depth = input_shape.Dims(3);
+  const int prev_activ_depth = prev_activ_shape.Dims(3);
+  const int total_input_depth = prev_activ_depth + input_depth;
+  TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), total_input_depth);
+  const int intern_activ_depth = MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
+  TFLITE_DCHECK_EQ(weights_shape.FlatSize(), intern_activ_depth * total_input_depth);
+  TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
+  // The fully-connected output holds four equally sized gate slices.
+  TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
+  const int output_depth = MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
+                                       3, output_activ_shape, 3);
+  TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
+  const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3);
+  const int fc_output_depth =
+    MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3);
+  const int fc_accum_depth = total_input_depth;
+  TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth);
+
+  // Depth-concatenate prev_activ and input data together.
+  uint8 const *concat_input_arrays_data[2] = {input_data_uint8, prev_activ_data_uint8};
+  const RuntimeShape *concat_input_arrays_shapes[2] = {&input_shape, &prev_activ_shape};
+  tflite::ConcatenationParams concat_params;
+  concat_params.axis = 3;
+  concat_params.inputs_count = 2;
+  Concatenation(concat_params, concat_input_arrays_shapes, concat_input_arrays_data,
+                concat_temp_shape, concat_temp_data_uint8);
+
+  // Implementation of the fully connected node inside the LSTM cell.
+  // The operands are 8-bit integers, the accumulators are internally 32bit
+  // integers, and the output is 16-bit fixed-point with 3 integer bits so
+  // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
+  // is explained in the function comment above.
+  for (int b = 0; b < fc_batches; ++b)
+  {
+    for (int out_c = 0; out_c < fc_output_depth; ++out_c)
+    {
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum = bias_data_int32[out_c];
+      // Accumulation loop.
+      for (int d = 0; d < fc_accum_depth; ++d)
+      {
+        // The activation zero point is hard-coded to 128, whereas the weights
+        // zero point comes from `params`.
+        int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
+        int16 weights_val = weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
+        accum += input_val * weights_val;
+      }
+      // Down-scale the final int32 accumulator to the scale used by our
+      // (16-bit, using 3 integer bits) fixed-point format. The quantized
+      // multiplier and shift here have been pre-computed offline
+      // (e.g. by toco).
+      accum = MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
+      // Saturate, cast to int16, and store to the temporary activations array.
+      accum = std::max(-32768, std::min(32767, static_cast<int>(accum)));
+      activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
+    }
+  }
+
+  // Rest of the LSTM cell: tanh and logistic math functions, and some adds
+  // and muls, all done in 16-bit fixed-point.
+  for (int b = 0; b < outer_size; ++b)
+  {
+    for (int c = 0; c < output_depth; ++c)
+    {
+      // Define the fixed-point data types that we will use here. All use
+      // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
+      // They only differ by the number of integral vs. fractional bits,
+      // determining the range of values that they can represent.
+      //
+      // F0 uses 0 integer bits, range [-1, 1].
+      // This is the return type of math functions such as tanh, logistic,
+      // whose range is in [-1, 1].
+      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+      // F3 uses 3 integer bits, range [-8, 8].
+      // This is the range of the previous fully-connected node's output,
+      // which is our input here.
+      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+      // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
+      // 2^StateIntegerBits]. It's used to represent the internal state, whose
+      // number of integer bits is currently dictated by the model. See comment
+      // on the StateIntegerBits template parameter above.
+      using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
+      // The four gates below read consecutive slices of width output_depth
+      // from the fully-connected output, at slice indices 0, 1, 2 and 3.
+      // Implementation of input gate, using fixed-point logistic function.
+      F3 input_gate_input =
+        F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
+      F0 input_gate_output = gemmlowp::logistic(input_gate_input);
+      // Implementation of input modulation gate, using fixed-point tanh
+      // function.
+      F3 input_modulation_gate_input =
+        F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
+      F0 input_modulation_gate_output = gemmlowp::tanh(input_modulation_gate_input);
+      // Implementation of forget gate, using fixed-point logistic function.
+      F3 forget_gate_input =
+        F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
+      F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
+      // Implementation of output gate, using fixed-point logistic function.
+      F3 output_gate_input =
+        F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
+      F0 output_gate_output = gemmlowp::logistic(output_gate_input);
+      // Implementation of internal multiplication nodes, still in fixed-point.
+      F0 input_times_input_modulation = input_gate_output * input_modulation_gate_output;
+      FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
+      FS prev_state_times_forget_state = forget_gate_output * prev_state;
+      // Implementation of internal addition node, saturating.
+      FS new_state =
+        gemmlowp::SaturatingAdd(gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
+                                prev_state_times_forget_state);
+      // Implementation of last internal Tanh node, still in fixed-point.
+      // Since a Tanh fixed-point implementation is specialized for a given
+      // number of integer bits, and each specialization can have a substantial
+      // code size, and we already used above a Tanh on an input with 3 integer
+      // bits, and per the table in the above function comment there is no
+      // significant accuracy to be lost by clamping to [-8, +8] for a
+      // 3-integer-bits representation, let us just do that. This helps people
+      // porting this to targets where code footprint must be minimized.
+      F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
+      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
+      // Store the new internal state back to memory, as 16-bit integers.
+      // Note: here we store the original value with StateIntegerBits, not
+      // the rescaled 3-integer-bits value fed to tanh.
+      output_state_data_int16[b * output_depth + c] = new_state.raw();
+      // Down-scale the output activations to 8-bit integers, saturating,
+      // and store back to memory.
+      // Dropping 8 of the 16 bits leaves a signed value that is clamped to
+      // [-128, 127]; adding 128 then maps it onto the uint8 output range.
+      int16 rescaled_output_activ = gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
+      int16 clamped_output_activ =
+        std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
+      output_activ_data_uint8[b * output_depth + c] = 128 + clamped_output_activ;
+    }
+  }
+}
+
+// Splits `input_data` along `params.axis` into `params.num_split` outputs.
+//
+// A negative `params.axis` counts from the last dimension. Every output shape
+// must have the same rank as the input and match it in every dimension except
+// the split axis; the output extents along the axis must add up to the input
+// extent. `output_shapes` and `output_data` are parallel arrays with
+// `params.num_split` entries each.
+template <typename Scalar>
+void Split(const SplitParams &params, const RuntimeShape &input_shape, const Scalar *input_data,
+           const RuntimeShape *const *output_shapes, Scalar *const *output_data)
+{
+  ruy::profiler::ScopeLabel label("Split");
+  const int split_dimensions = input_shape.DimensionsCount();
+  const int axis = params.axis < 0 ? params.axis + split_dimensions : params.axis;
+  const int outputs_count = params.num_split;
+  // The normalised axis has to be a valid dimension index on both sides.
+  TFLITE_DCHECK_GE(axis, 0);
+  TFLITE_DCHECK_LT(axis, split_dimensions);
+
+  int64_t split_size = 0;
+  for (int i = 0; i < outputs_count; i++)
+  {
+    TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), split_dimensions);
+    for (int j = 0; j < split_dimensions; j++)
+    {
+      if (j != axis)
+      {
+        // Called only for its consistency check; the result is not needed.
+        MatchingDim(*output_shapes[i], j, input_shape, j);
+      }
+    }
+    split_size += output_shapes[i]->Dims(axis);
+  }
+  TFLITE_DCHECK_EQ(split_size, input_shape.Dims(axis));
+  int64_t outer_size = 1;
+  for (int i = 0; i < axis; ++i)
+  {
+    outer_size *= input_shape.Dims(i);
+  }
+  // For all output arrays,
+  // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+  int64_t base_inner_size = 1;
+  for (int i = axis + 1; i < split_dimensions; ++i)
+  {
+    base_inner_size *= input_shape.Dims(i);
+  }
+
+  // Walk the input once, handing each output its contiguous chunk in turn.
+  // The loop index and chunk size stay 64-bit, like the sizes they are derived
+  // from, so that `k * copy_size` cannot be truncated or overflow an int.
+  const Scalar *input_ptr = input_data;
+  for (int64_t k = 0; k < outer_size; k++)
+  {
+    for (int i = 0; i < outputs_count; ++i)
+    {
+      const int64_t copy_size = output_shapes[i]->Dims(axis) * base_inner_size;
+      memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar));
+      input_ptr += copy_size;
+    }
+  }
+}
+
+// Flattens the coordinate (b, h, w) into a linear index for a row-major
+// layout whose two innermost extents are `height` and `width`.
+inline int NodeOffset(int b, int h, int w, int height, int width)
+{
+  const int row_index = b * height + h;
+  return row_index * width + w;
+}
+
+// Local response normalization over the trailing dimension.
+//
+// Each element is multiplied by
+//   (bias + alpha * sum_of_squares) ^ (-beta)
+// where the sum runs over a window of neighbouring elements of the same row,
+// clipped to [0, depth).
+// NOTE(review): the window is [c - range, c + range), i.e. its upper end is
+// exclusive and the window is not symmetric around c - confirm this is the
+// intended definition.
+inline void LocalResponseNormalization(const tflite::LocalResponseNormalizationParams &op_params,
+                                       const RuntimeShape &input_shape, const float *input_data,
+                                       const RuntimeShape &output_shape, float *output_data)
+{
+  const int last_dim = input_shape.DimensionsCount() - 1;
+  const int num_rows = MatchingFlatSizeSkipDim(input_shape, last_dim, output_shape);
+  const int depth = MatchingDim(input_shape, last_dim, output_shape, last_dim);
+
+  for (int row = 0; row < num_rows; ++row)
+  {
+    const float *in_row = input_data + row * depth;
+    float *out_row = output_data + row * depth;
+    for (int c = 0; c < depth; ++c)
+    {
+      const int window_begin = std::max(0, static_cast<int>(c - op_params.range));
+      const int window_end = std::min(depth, static_cast<int>(c + op_params.range));
+      float sum_of_squares = 0.f;
+      for (int k = window_begin; k < window_end; ++k)
+      {
+        sum_of_squares += in_row[k] * in_row[k];
+      }
+      const float multiplier =
+        std::pow(op_params.bias + op_params.alpha * sum_of_squares, -op_params.beta);
+      out_row[c] = in_row[c] * multiplier;
+    }
+  }
+}
+
+// Converts every half-precision element of the input to float.
+// Input and output must have the same flat size.
+inline void Dequantize(const RuntimeShape &input_shape, const Eigen::half *input_data,
+                       const RuntimeShape &output_shape, float *output_data)
+{
+  const int num_elements = MatchingFlatSize(input_shape, output_shape);
+  const Eigen::half *src = input_data;
+  const Eigen::half *const src_end = input_data + num_elements;
+  float *dst = output_data;
+  while (src < src_end)
+  {
+    *dst = static_cast<float>(*src);
+    ++src;
+    ++dst;
+  }
+}
+
+// Fake-quantizes a float tensor.
+//
+// The [min, max] range from `op_params.minmax` is nudged by
+// NudgeQuantizationRange() for a quantized range of
+// [0, 2^num_bits - 1], and the nudged parameters are then passed to
+// FakeQuantizeArray() together with the input and output buffers.
+inline void FakeQuant(const tflite::FakeQuantParams &op_params, const RuntimeShape &input_shape,
+                      const float *input_data, const RuntimeShape &output_shape, float *output_data)
+{
+  ruy::profiler::ScopeLabel label("FakeQuant");
+  const float range_min = op_params.minmax.min;
+  const float range_max = op_params.minmax.max;
+  const int bit_width = op_params.num_bits;
+  // Zero must be representable, so the requested range is expected to
+  // straddle it and to be non-empty.
+  TFLITE_DCHECK_LE(range_min, 0.0f);
+  TFLITE_DCHECK_GE(range_max, 0.0f);
+  TFLITE_DCHECK_LT(range_min, range_max);
+
+  // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
+  const int lowest_level = 0;
+  const int highest_level = (1 << bit_width) - 1;
+  float nudged_min;
+  float nudged_max;
+  float nudged_scale;
+  NudgeQuantizationRange(range_min, range_max, lowest_level, highest_level, &nudged_min,
+                         &nudged_max, &nudged_scale);
+  const int num_elements = MatchingFlatSize(input_shape, output_shape);
+  FakeQuantizeArray(nudged_scale, nudged_min, nudged_max, input_data, output_data, num_elements);
+}
+
+// Common subroutine for both `GatherNd` and `GatherNdString`.
+// Holds the values that `GatherNdHelper` derives from the params and indices
+// shapes.
+struct GatherNdHelperResult
+{
+  // Product of all dimensions of the indices shape except the last one.
+  int n_slices;
+  // Product of the params dimensions from `indices_nd` onwards.
+  int slice_size;
+  // Extent of the last dimension of the indices shape.
+  int indices_nd;
+  // One entry per indexed params dimension: entry i is the params flat size
+  // divided by the extents of dimensions 0..i, used as the multiplier for the
+  // i-th index component.
+  std::vector<int> dims_to_count;
+};
+
+// Derives the slice count, slice length, index depth and per-dimension
+// multipliers shared by `GatherNd` and `GatherNdString`.
+inline GatherNdHelperResult GatherNdHelper(const RuntimeShape &params_shape,
+                                           const RuntimeShape &indices_shape)
+{
+  const int indices_rank = indices_shape.DimensionsCount();
+  const int params_rank = params_shape.DimensionsCount();
+
+  GatherNdHelperResult result;
+  // The last indices dimension says how many leading params dimensions each
+  // index tuple addresses.
+  result.indices_nd = indices_shape.Dims(indices_rank - 1);
+
+  // Every other indices dimension contributes to the number of slices.
+  result.n_slices = 1;
+  for (int d = 0; d < indices_rank - 1; ++d)
+  {
+    result.n_slices *= indices_shape.Dims(d);
+  }
+
+  // The params dimensions that are not indexed make up one slice.
+  result.slice_size = 1;
+  for (int d = result.indices_nd; d < params_rank; ++d)
+  {
+    result.slice_size *= params_shape.Dims(d);
+  }
+
+  // Peel one params dimension at a time off the flat size to obtain the
+  // multiplier for each index component.
+  result.dims_to_count.assign(result.indices_nd, 0);
+  int stride = params_shape.FlatSize();
+  for (int d = 0; d < result.indices_nd; ++d)
+  {
+    stride /= params_shape.Dims(d);
+    result.dims_to_count[d] = stride;
+  }
+
+  return result;
+}
+
+// Gathers slices of `params_data` addressed by the index tuples in
+// `indices_data` and writes them back to back into `output_data`.
+//
+// Each tuple has `indices_nd` components (the last dimension of
+// `indices_shape`); component j selects a position along params dimension j.
+// `output_shape` is not consulted: the output buffer must hold
+// n_slices * slice_size elements as computed by GatherNdHelper().
+template <typename ParamsT, typename IndicesT = int32>
+inline void GatherNd(const RuntimeShape &params_shape, const ParamsT *params_data,
+                     const RuntimeShape &indices_shape, const IndicesT *indices_data,
+                     const RuntimeShape &output_shape, ParamsT *output_data)
+{
+  ruy::profiler::ScopeLabel label("GatherNd");
+
+  const GatherNdHelperResult res = GatherNdHelper(params_shape, indices_shape);
+  for (int i = 0; i < res.n_slices; ++i)
+  {
+    int from_pos = 0;
+    for (int j = 0; j < res.indices_nd; ++j)
+    {
+      const IndicesT idx = indices_data[i * res.indices_nd + j];
+      // Same debug-time range check as ScatterNd: an out-of-range index would
+      // otherwise make the memcpy below read outside `params_data`.
+      TFLITE_DCHECK(0 <= idx && idx < params_shape.Dims(j));
+      from_pos += idx * res.dims_to_count[j];
+    }
+    std::memcpy(output_data + i * res.slice_size, params_data + from_pos,
+                sizeof(ParamsT) * res.slice_size);
+  }
+}
+
+#ifndef TF_LITE_STATIC_MEMORY
+// String-tensor variant of GatherNd.
+//
+// For every index tuple, the `slice_size` consecutive strings starting at the
+// addressed flat position of `params_data` are appended to a DynamicBuffer,
+// which is finally written to `output_data`. `output_shape` is not consulted.
+template <typename IndicesT = int32>
+inline void GatherNdString(const RuntimeShape &params_shape, const TfLiteTensor *params_data,
+                           const RuntimeShape &indices_shape, const IndicesT *indices_data,
+                           const RuntimeShape &output_shape, TfLiteTensor *output_data)
+{
+  ruy::profiler::ScopeLabel label("GatherNdString");
+
+  const GatherNdHelperResult res = GatherNdHelper(params_shape, indices_shape);
+  DynamicBuffer buffer;
+  for (int i = 0; i < res.n_slices; ++i)
+  {
+    int from_pos = 0;
+    for (int j = 0; j < res.indices_nd; ++j)
+    {
+      const IndicesT idx = indices_data[i * res.indices_nd + j];
+      // Same debug-time range check as ScatterNd: an out-of-range index would
+      // otherwise address a string outside the params tensor.
+      TFLITE_DCHECK(0 <= idx && idx < params_shape.Dims(j));
+      from_pos += idx * res.dims_to_count[j];
+    }
+    for (int j = 0; j < res.slice_size; ++j)
+    {
+      buffer.AddString(GetString(params_data, from_pos + j));
+    }
+  }
+  buffer.WriteToTensor(output_data, /*new_shape=*/nullptr);
+}
+#endif
+
+// Scatters `updates_data` into a zero-initialised output.
+//
+// Each index tuple (its length is the last dimension of `indices_shape`)
+// addresses a position along the leading output dimensions; the matching
+// slice of updates is added element-wise at that position, so updates for
+// repeated indices accumulate.
+template <typename IndicesT, typename UpdatesT>
+inline void ScatterNd(const RuntimeShape &indices_shape, const IndicesT *indices_data,
+                      const RuntimeShape &updates_shape, const UpdatesT *updates_data,
+                      const RuntimeShape &output_shape, UpdatesT *output_data)
+{
+  ruy::profiler::ScopeLabel label("ScatterNd");
+
+  const int last_indices_dim = indices_shape.DimensionsCount() - 1;
+  const int index_depth = indices_shape.Dims(last_indices_dim);
+  const int updates_rank = updates_shape.DimensionsCount();
+
+  // Number of index tuples: product of all but the last indices dimension.
+  int num_updates = 1;
+  for (int d = 0; d < last_indices_dim; ++d)
+  {
+    num_updates *= indices_shape.Dims(d);
+  }
+  // Elements per update slice: product of the trailing updates dimensions.
+  int update_length = 1;
+  for (int d = last_indices_dim; d < updates_rank; ++d)
+  {
+    update_length *= updates_shape.Dims(d);
+  }
+
+  // Multiplier for each index component, obtained by peeling output
+  // dimensions off the flat size one at a time.
+  const int output_length = output_shape.FlatSize();
+  std::vector<int> multipliers(index_depth, 0);
+  int stride = output_length;
+  for (int d = 0; d < index_depth; ++d)
+  {
+    stride /= output_shape.Dims(d);
+    multipliers[d] = stride;
+  }
+
+  memset(output_data, 0, sizeof(UpdatesT) * output_length);
+  for (int u = 0; u < num_updates; ++u)
+  {
+    const IndicesT *index_tuple = indices_data + u * index_depth;
+    int to_pos = 0;
+    for (int d = 0; d < index_depth; ++d)
+    {
+      IndicesT idx = index_tuple[d];
+      TFLITE_DCHECK(0 <= idx && idx < output_shape.Dims(d));
+      to_pos += idx * multipliers[d];
+    }
+    UpdatesT *dst = output_data + to_pos;
+    const UpdatesT *src = updates_data + u * update_length;
+    for (int e = 0; e < update_length; e++)
+    {
+      dst[e] += src[e];
+    }
+  }
+}
+
+// Core slice routine: visits every selected element of the input, in
+// row-major order, and passes its flat input offset to `writer`.
+//
+// The input shape is extended to 5 dimensions. `op_params.begin` and
+// `op_params.size` may be shorter than 5 entries, in which case they apply to
+// the trailing dimensions; missing leading entries select the whole
+// dimension. A size of -1 means "up to the end of the dimension".
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+                  const RuntimeShape &output_shape, SequentialTensorWriter<T> *writer)
+{
+  const RuntimeShape ext_shape = RuntimeShape::ExtendedShape(5, input_shape);
+  TFLITE_DCHECK_LE(op_params.begin_count, 5);
+  TFLITE_DCHECK_LE(op_params.size_count, 5);
+  // Number of leading dimensions without an explicit begin / size entry.
+  const int begin_pad = 5 - op_params.begin_count;
+  const int size_pad = 5 - op_params.size_count;
+
+  std::array<int, 5> first;
+  std::array<int, 5> limit;
+  for (int d = 0; d < 5; ++d)
+  {
+    if (d < begin_pad)
+    {
+      first[d] = 0;
+    }
+    else
+    {
+      first[d] = op_params.begin[d - begin_pad];
+    }
+
+    if (d < size_pad || op_params.size[d - size_pad] == -1)
+    {
+      limit[d] = ext_shape.Dims(d);
+    }
+    else
+    {
+      limit[d] = first[d] + op_params.size[d - size_pad];
+    }
+  }
+
+  for (int a = first[0]; a < limit[0]; ++a)
+  {
+    for (int b = first[1]; b < limit[1]; ++b)
+    {
+      for (int c = first[2]; c < limit[2]; ++c)
+      {
+        for (int d = first[3]; d < limit[3]; ++d)
+        {
+          for (int e = first[4]; e < limit[4]; ++e)
+          {
+            writer->Write(Offset(ext_shape, a, b, c, d, e));
+          }
+        }
+      }
+    }
+  }
+}
+
+// Raw-pointer overload: wraps the buffers in a SequentialTensorWriter and
+// delegates to the writer-based Slice.
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+                  const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{
+  SequentialTensorWriter<T> sequential_writer(input_data, output_data);
+  Slice(op_params, input_shape, output_shape, &sequential_writer);
+}
+
+// TfLiteTensor overload: wraps the tensors in a SequentialTensorWriter and
+// delegates to the writer-based Slice.
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+                  const TfLiteTensor *input, const RuntimeShape &output_shape, TfLiteTensor *output)
+{
+  SequentialTensorWriter<T> sequential_writer(input, output);
+  Slice(op_params, input_shape, output_shape, &sequential_writer);
+}
+
+// Element-wise minimum of `input1_data` against a single value: only the
+// first element of `input2_data` is read and used for every output element.
+template <typename T>
+void Minimum(const RuntimeShape &input1_shape, const T *input1_data, const T *input2_data,
+             const RuntimeShape &output_shape, T *output_data)
+{
+  const int num_elements = MatchingFlatSize(input1_shape, output_shape);
+
+  const T cap = input2_data[0];
+  for (int idx = 0; idx < num_elements; ++idx)
+  {
+    if (input1_data[idx] > cap)
+    {
+      output_data[idx] = cap;
+    }
+    else
+    {
+      output_data[idx] = input1_data[idx];
+    }
+  }
+}
+
+// Overload with the usual binary-op argument list, so that e.g. generated
+// code can call Minimum like any other binary op.
+template <typename T>
+inline void Minimum(const RuntimeShape &input1_shape, const T *input1_data,
+                    const RuntimeShape & /*input2_shape*/, const T *input2_data,
+                    const RuntimeShape &output_shape, T *output_data)
+{
+  // The shape of the second input is ignored; forward everything else.
+  Minimum(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+// Element-wise maximum of `input1_data` against a single value: only the
+// first element of `input2_data` is read and used for every output element.
+template <typename T>
+void Maximum(const RuntimeShape &input1_shape, const T *input1_data, const T *input2_data,
+             const RuntimeShape &output_shape, T *output_data)
+{
+  const int num_elements = MatchingFlatSize(input1_shape, output_shape);
+
+  const T floor_value = input2_data[0];
+  for (int idx = 0; idx < num_elements; ++idx)
+  {
+    if (input1_data[idx] < floor_value)
+    {
+      output_data[idx] = floor_value;
+    }
+    else
+    {
+      output_data[idx] = input1_data[idx];
+    }
+  }
+}
+
+// Overload with the usual binary-op argument list, so that e.g. generated
+// code can call Maximum like any other binary op.
+template <typename T>
+inline void Maximum(const RuntimeShape &input1_shape, const T *input1_data,
+                    const RuntimeShape & /*input2_shape*/, const T *input2_data,
+                    const RuntimeShape &output_shape, T *output_data)
+{
+  // The shape of the second input is ignored; forward everything else.
+  Maximum(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+// ArgMax is ArgMinMax with a "greater than" comparator on the input type.
+template <typename T1, typename T2, typename T3>
+void ArgMax(const RuntimeShape &input1_shape, const T1 *input1_data, const T3 *input2_data,
+            const RuntimeShape &output_shape, T2 *output_data)
+{
+  const auto prefer_larger = std::greater<T1>();
+  ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data, prefer_larger);
+}
+
+// Overload with the usual binary-op argument list, so that e.g. generated
+// code can call ArgMax like any other binary op.
+template <typename T1, typename T2, typename T3>
+inline void ArgMax(const RuntimeShape &input1_shape, const T1 *input1_data,
+                   const RuntimeShape &input2_shape, const T3 *input2_data,
+                   const RuntimeShape &output_shape, T2 *output_data)
+{
+  // `input2_shape` is accepted for signature parity only and is not used.
+  ArgMax(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+// Element-wise select: output[i] is x[i] where condition[i] is true and y[i]
+// otherwise. All four tensors must have the same flat size.
+template <typename D, typename T>
+void Select(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+            const RuntimeShape &input_x_shape, const T *input_x_data,
+            const RuntimeShape &input_y_shape, const T *input_y_data,
+            const RuntimeShape &output_shape, T *output_data)
+{
+  // When every tensor holds exactly one element, the shape-matching check is
+  // skipped so that scalars and one-element tensors can be mixed.
+  const bool all_single_element =
+    input_condition_shape.FlatSize() == 1 && input_x_shape.FlatSize() == 1 &&
+    input_y_shape.FlatSize() == 1 && output_shape.FlatSize() == 1;
+  const int64_t num_elements =
+    all_single_element
+      ? 1
+      : MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape);
+
+  for (int64_t idx = 0; idx < num_elements; ++idx)
+  {
+    if (input_condition_data[idx])
+    {
+      output_data[idx] = input_x_data[idx];
+    }
+    else
+    {
+      output_data[idx] = input_y_data[idx];
+    }
+  }
+}
+
+// Select with one condition value per outermost slice: slice i of the output
+// is copied from x when condition[i] is true and from y otherwise.
+//
+// A zero-dimensional condition selects between the complete x and y tensors.
+template <typename D, typename T>
+void RankOneSelect(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+                   const RuntimeShape &input_x_shape, const T *input_x_data,
+                   const RuntimeShape &input_y_shape, const T *input_y_data,
+                   const RuntimeShape &output_shape, T *output_data)
+{
+  const int64_t num_slices = input_condition_shape.FlatSize();
+  int64_t slice_length;
+  if (input_condition_shape.DimensionsCount() != 0)
+  {
+    // One condition value per entry of dimension 0 of x, y and the output.
+    TFLITE_DCHECK_EQ(MatchingDim(input_x_shape, 0, input_y_shape, 0, output_shape, 0), num_slices);
+    slice_length = MatchingFlatSizeSkipDim(input_x_shape, 0, input_y_shape, output_shape);
+  }
+  else
+  {
+    slice_length = MatchingFlatSize(input_x_shape, input_y_shape, output_shape);
+  }
+
+  for (int64_t slice = 0; slice < num_slices; slice++)
+  {
+    const int64_t begin = slice * slice_length;
+    const T *chosen = input_condition_data[slice] ? input_x_data : input_y_data;
+    memcpy(output_data + begin, chosen + begin, slice_length * sizeof(T));
+  }
+}
+
+template <typename D, typename T> // D: condition element type, T: value element type
+void BroadcastSelect4DSlow(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+                           const RuntimeShape &input_x_shape, const T *input_x_data,
+                           const RuntimeShape &input_y_shape, const T *input_y_data,
+                           const RuntimeShape &output_shape, T *output_data)
+{
+  TFLITE_DCHECK_LE(input_condition_shape.DimensionsCount(), 4); // every operand is at most rank 4
+  TFLITE_DCHECK_LE(input_x_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(input_y_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
+
+  // NOTE(review): ExtendedShape(4, ...) presumably pads the shape up to rank 4 - confirm
+  const RuntimeShape extended_output_shape = RuntimeShape::ExtendedShape(4, output_shape);
+
+  NdArrayDesc<4> desc_condition; // per-input descriptors used by SubscriptToIndex below
+  NdArrayDesc<4> desc_x;
+  NdArrayDesc<4> desc_y;
+  NdArrayDescsForElementwiseBroadcast(input_condition_shape, input_x_shape, input_y_shape,
+                                      &desc_condition, &desc_x, &desc_y);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest
+  // stride, typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for
+  // the best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+  {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+    {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+      {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+        {
+          // Each input is addressed through its own descriptor for the same output coordinate.
+          const int condition_index = SubscriptToIndex(desc_condition, b, y, x, c);
+          const int x_index = SubscriptToIndex(desc_x, b, y, x, c);
+          const int y_index = SubscriptToIndex(desc_y, b, y, x, c);
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+            input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index];
+        }
+      }
+    }
+  }
+}
+
+template <typename D, typename T> // D: condition element type, T: output coordinate type
+void SelectTrueCoords(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+                      T *output_data)
+{
+  const size_t size = input_condition_shape.FlatSize();
+  if (size == 0)
+  {
+    // Dimension is zero, in which case we don't need to output.
+    return;
+  }
+  const size_t cond_rank = input_condition_shape.DimensionsCount();
+
+  std::vector<int> dims_to_count(cond_rank, 0); // dims_to_count[i]: flat elements per step along dim i
+  int cur_flat_size = size;
+  for (int i = 0; i < cond_rank; ++i) // NOTE(review): int index compared with a size_t bound
+  {
+    dims_to_count[i] = cur_flat_size / input_condition_shape.Dims(i);
+    cur_flat_size = dims_to_count[i];
+  }
+
+  int output_index = 0; // number of coordinate tuples written so far
+  for (int i = 0; i < size; ++i) // NOTE(review): int index compared with a size_t bound
+  {
+    if (input_condition_data[i])
+    {
+      // Insert the coordinate of the current item (row major) into output.
+      int flat_index = i;
+      for (int j = 0; j < cond_rank; ++j)
+      {
+        int coord_j = flat_index / dims_to_count[j];
+        output_data[output_index * cond_rank + j] = coord_j; // tuples are stored back to back
+        flat_index %= dims_to_count[j]; // remainder is the flat index within the inner dims
+      }
+      output_index++;
+    }
+  }
+}
+
+// For ease of implementation, indices is always a vector of size-4 vectors.
+template <typename T, typename TI>
+inline void SparseToDense(const std::vector<std::vector<TI>> &indices, const T *values,
+                          T default_value, bool value_is_scalar,
+                          const RuntimeShape &unextended_output_shape, T *output_data)
+{
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape);
+  const int value_count = indices.size(); // one value is written per index entry
+
+  // First fill the output_data with default value.
+  const int num_elements = output_shape.FlatSize();
+  for (int i = 0; i < num_elements; ++i)
+  {
+    output_data[i] = default_value;
+  }
+
+  // Special handle for value is scalar case to avoid checking the boolean
+  // condition within the loop every time.
+  if (value_is_scalar)
+  {
+    for (int i = 0; i < value_count; ++i)
+    {
+      const std::vector<TI> &index = indices[i];
+      TFLITE_DCHECK_EQ(index.size(), 4);
+      const T value = *values; // just use the first value.
+      output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] = value;
+    }
+    return;
+  }
+
+  // Go through the values and indices to fill the sparse values.
+  for (int i = 0; i < value_count; ++i)
+  {
+    const std::vector<TI> &index = indices[i];
+    TFLITE_DCHECK_EQ(index.size(), 4);
+    const T value = values[i]; // values[i] pairs with indices[i]
+    output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] = value;
+  }
+}
+
+template <typename T> // element-wise power, no broadcasting
+inline void Pow(const RuntimeShape &input1_shape, const T *input1_data,
+                const RuntimeShape &input2_shape, const T *input2_data,
+                const RuntimeShape &output_shape, T *output_data)
+{
+  const int flat_size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i)
+  {
+    output_data[i] = std::pow(input1_data[i], input2_data[i]); // input1 is the base, input2 the exponent
+  }
+}
+
+template <typename T> // element-wise power over operands of rank <= 4, indexed via NdArrayDesc
+inline void BroadcastPow4DSlow(const RuntimeShape &unextended_input1_shape, const T *input1_data,
+                               const RuntimeShape &unextended_input2_shape, const T *input2_data,
+                               const RuntimeShape &unextended_output_shape, T *output_data)
+{
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  NdArrayDesc<4> desc1; // descriptor used to index input1 from an output coordinate
+  NdArrayDesc<4> desc2; // descriptor used to index input2 from an output coordinate
+  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
+                                      &desc2);
+
+  for (int b = 0; b < output_shape.Dims(0); ++b)
+  {
+    for (int y = 0; y < output_shape.Dims(1); ++y)
+    {
+      for (int x = 0; x < output_shape.Dims(2); ++x)
+      {
+        for (int c = 0; c < output_shape.Dims(3); ++c)
+        {
+          auto out_idx = Offset(output_shape, b, y, x, c);
+          auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+          auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+          auto in1_val = input1_data[in1_idx]; // base
+          auto in2_val = input2_data[in2_idx]; // exponent
+          output_data[out_idx] = std::pow(in1_val, in2_val);
+        }
+      }
+    }
+  }
+}
+
+template <typename Scalar> // reverses the order of the slices along `axis`
+void Reverse(int axis, const RuntimeShape &input_shape, const Scalar *input_data,
+             const RuntimeShape &output_shape, Scalar *output_data)
+{
+  ruy::profiler::ScopeLabel label("Reverse");
+
+  int outer_size = 1; // product of the dimensions before `axis`
+  for (int i = 0; i < axis; ++i)
+  {
+    outer_size *= input_shape.Dims(i);
+  }
+
+  int copy_size = 1; // product of the dimensions after `axis`: elements moved per memcpy
+  for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
+  {
+    copy_size *= input_shape.Dims(i);
+  }
+
+  const int dims_at_axis = input_shape.Dims(axis);
+  for (int i = 0; i < outer_size; ++i)
+  {
+    for (int j = 0; j < dims_at_axis; ++j)
+    {
+      const int start_pos = (i * dims_at_axis + j) * copy_size; // destination slice j
+      Scalar *output_ptr = output_data + start_pos;
+      int loc = (i * dims_at_axis + dims_at_axis - j - 1) * copy_size; // mirrored source slice
+      memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
+    }
+  }
+}
+
+template <typename Scalar, typename TS> // Scalar: data element type, TS: seq_lengths element type
+void ReverseSequence(const TS *seq_lengths, const int seq_dim, const int batch_dim,
+                     const RuntimeShape &input_shape, const Scalar *input_data,
+                     const RuntimeShape &output_shape, Scalar *output_data)
+{
+  ruy::profiler::ScopeLabel label("ReverseSequence");
+
+  int outer_size = 1; // product of the dimensions before the smaller of the two axes
+  int outer_dim = std::min(batch_dim, seq_dim);
+  int medium_dim = std::max(batch_dim, seq_dim);
+  for (int i = 0; i < outer_dim; ++i)
+  {
+    outer_size *= input_shape.Dims(i);
+  }
+
+  int medium_size = 1; // product of the dimensions strictly between the two axes
+  for (int i = outer_dim + 1; i < medium_dim; ++i)
+  {
+    medium_size *= input_shape.Dims(i);
+  }
+
+  int copy_size = 1; // product of the dimensions after the larger axis: elements per memcpy
+  for (int i = medium_dim + 1; i < input_shape.DimensionsCount(); ++i)
+  {
+    copy_size *= input_shape.Dims(i);
+  }
+
+  const int dims_at_outer_dim = input_shape.Dims(outer_dim);
+  const int dims_at_medium_dim = input_shape.Dims(medium_dim);
+
+  Scalar *output_ptr; // NOTE(review): if batch_dim == seq_dim neither branch runs; nothing is written
+  if (batch_dim > seq_dim) // here j walks the sequence axis and q walks the batch axis
+  {
+    for (int i = 0; i < outer_size; ++i)
+    {
+      for (int j = 0; j < dims_at_outer_dim; ++j)
+      {
+        const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+        for (int p = 0; p < medium_size; ++p)
+        {
+          for (int q = 0; q < dims_at_medium_dim; ++q)
+          {
+            const int in_pos = ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+            const Scalar *in_ptr = input_data + in_pos;
+            int sl = seq_lengths[q] - 1; // last sequence position that is reversed for batch q
+            if (j > sl) // beyond the reversed prefix: copied to the same position
+            {
+              output_ptr = output_data + in_pos;
+            }
+            else // inside the prefix: position j is written to position sl - j
+            {
+              const int out_pos_base = (i * dims_at_outer_dim + sl - j) * medium_size;
+              const int out_pos = ((out_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+              output_ptr = output_data + out_pos;
+            }
+            memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
+          }
+        }
+      }
+    }
+  }
+  else if (batch_dim < seq_dim) // here j walks the batch axis and q walks the sequence axis
+  {
+    for (int i = 0; i < outer_size; ++i)
+    {
+      for (int j = 0; j < dims_at_outer_dim; ++j)
+      {
+        const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+        int sl = seq_lengths[j] - 1; // last sequence position that is reversed for batch j
+        const int out_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+        for (int p = 0; p < medium_size; ++p)
+        {
+          for (int q = 0; q < dims_at_medium_dim; ++q)
+          {
+            const int in_pos = ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+            const Scalar *in_ptr = input_data + in_pos;
+            if (q > sl) // beyond the reversed prefix: copied to the same position
+            {
+              output_ptr = output_data + in_pos;
+            }
+            else // inside the prefix: position q is written to position sl - q
+            {
+              const int out_pos = ((out_pos_base + p) * dims_at_medium_dim + sl - q) * copy_size;
+              output_ptr = output_data + out_pos;
+            }
+            memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T> // adds each input row (index along dim 0) into the output row named by its id
+inline void SegmentSum(const RuntimeShape &input_shape, const T *input_data,
+                       const RuntimeShape &segment_ids_shape, const int32_t *segment_ids_data,
+                       const RuntimeShape &output_shape, T *output_data)
+{
+  const int segment_flat_size = MatchingFlatSizeSkipDim(input_shape, 0, output_shape); // row length
+
+  memset(output_data, 0, sizeof(T) * output_shape.FlatSize()); // accumulators start as zero bytes
+
+  for (int i = 0; i < input_shape.Dims(0); i++)
+  {
+    int output_index = segment_ids_data[i]; // NOTE(review): not range-checked in this function
+    for (int j = 0; j < segment_flat_size; ++j)
+    {
+      output_data[output_index * segment_flat_size + j] += input_data[i * segment_flat_size + j];
+    }
+  }
+}
+
+} // namespace reference_ops
+} // namespace tflite
+
+#endif // LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
diff --git a/compiler/luci-interpreter/src/core/KernelParams.h b/compiler/luci-interpreter/src/core/KernelParams.h
index 958fd4b74..6c0220c62 100644
--- a/compiler/luci-interpreter/src/core/KernelParams.h
+++ b/compiler/luci-interpreter/src/core/KernelParams.h
@@ -170,6 +170,11 @@ struct ResizeNearestNeighborParams
   bool half_pixel_centers;
 };
 
+struct ShapeParams // NOTE(review): presumably the parameters of a Shape kernel - confirm
+{
+  loco::DataType out_type; // requested element type of the output - TODO confirm against the kernel
+};
+
 struct SubParams
 {
   Activation activation;
diff --git a/compiler/luci-interpreter/src/kernels/Fill.cpp b/compiler/luci-interpreter/src/kernels/Fill.cpp
new file mode 100644
index 000000000..e09d6331a
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Fill.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Fill.h"
+#include "kernels/Utils.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Fill::Fill(const Tensor *dims, const Tensor *value, Tensor *output)
+  : Kernel({dims, value}, {output}) // inputs: {dims, value}; outputs: {output}
+{
+}
+
+template <typename T> void Fill::configureShape() // T: element type of the dims tensor
+{
+  const auto dims_data = getTensorData<T>(dims());
+  Shape output_shape(dims()->shape().dim(0)); // output rank = number of entries in dims
+
+  for (int i = 0; i < output_shape.num_dims(); ++i)
+  {
+    T data = dims_data[i];
+    if (data < 0)
+      throw std::runtime_error("Fill dimensions must be >= 0");
+
+    output_shape.dim(i) = data; // NOTE(review): an int64 entry may be narrowed here - confirm dim type
+  }
+
+  output()->resize(output_shape);
+}
+
+void Fill::configure() // validates dims / value and resizes the output to the shape held in dims
+{
+  const auto dims_shape = dims()->shape();
+  const auto value_shape = value()->shape();
+
+  // Make sure the 1st input tensor is 1-D
+  LUCI_INTERPRETER_CHECK(dims_shape.num_dims() == 1);
+
+  // Make sure the 1st input tensor is int32 or int64
+  LUCI_INTERPRETER_CHECK(dims()->element_type() == DataType::S32 or
+                         dims()->element_type() == DataType::S64);
+
+  // Make sure the 2nd input tensor is a scalar
+  LUCI_INTERPRETER_CHECK(value_shape.num_dims() == 0);
+
+  // Quantized S16 / S8: value and output must share scale and zero point
+  if (value()->element_type() == loco::DataType::S16 or
+      value()->element_type() == loco::DataType::S8)
+  {
+    LUCI_INTERPRETER_CHECK(value()->scale() == output()->scale());
+    LUCI_INTERPRETER_CHECK(value()->zero_point() == output()->zero_point());
+
+    if (value()->element_type() == loco::DataType::S16) // S16 additionally requires zero point 0
+      LUCI_INTERPRETER_CHECK(value()->zero_point() == 0);
+  }
+  // Resize output, reading dims with the element type it was declared with
+  switch (dims()->element_type())
+  {
+    case DataType::S32:
+      configureShape<int32_t>();
+      break;
+    case DataType::S64:
+      configureShape<int64_t>();
+      break;
+    default: // not reachable while the S32 / S64 check above holds
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Fill::execute() const // dispatches on the output element type to the matching reference Fill
+{
+  switch (output()->element_type())
+  {
+    case DataType::S8:
+      tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<int8_t>(value()),
+                                  getTensorShape(output()), getTensorData<int8_t>(output()));
+      break;
+    case DataType::S16:
+      tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<int16_t>(value()),
+                                  getTensorShape(output()), getTensorData<int16_t>(output()));
+      break;
+    case DataType::S32:
+      tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<int32_t>(value()),
+                                  getTensorShape(output()), getTensorData<int32_t>(output()));
+      break;
+    case DataType::S64:
+      tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<int64_t>(value()),
+                                  getTensorShape(output()), getTensorData<int64_t>(output()));
+      break;
+    case DataType::FLOAT32:
+      tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<float>(value()),
+                                  getTensorShape(output()), getTensorData<float>(output()));
+      break;
+    default: // any other output element type is rejected
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Fill.h b/compiler/luci-interpreter/src/kernels/Fill.h
new file mode 100644
index 000000000..184f0cb83
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Fill.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_FILL_H
+#define LUCI_INTERPRETER_KERNELS_FILL_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Fill : public Kernel // kernel that fills an output tensor with a single scalar value
+{
+public:
+  Fill(const Tensor *dims, const Tensor *value, Tensor *output);
+
+  const Tensor *dims() const { return _inputs[0]; } // 1-D S32/S64 tensor holding the output shape
+  const Tensor *value() const { return _inputs[1]; } // scalar tensor holding the fill value
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override; // validates the inputs and resizes the output
+  void execute() const override; // writes the value into the output
+
+private:
+  template <typename T> void configureShape(); // T: element type used to read dims
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_FILL_H
diff --git a/compiler/luci-interpreter/src/kernels/Fill.test.cpp b/compiler/luci-interpreter/src/kernels/Fill.test.cpp
new file mode 100644
index 000000000..cf56df507
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Fill.test.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Fill.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class FillTest : public ::testing::Test // fixture supplying a memory manager to each test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager; // created anew in SetUp()
+};
+
+template <typename T, DataType DT> void runFillIntKernel(IMemoryManager *memory_manager)
+{
+  Shape dims_shape{2}; // dims is a 1-D tensor with two entries
+
+  std::vector<int32_t> dims_data = {2, 3}; // requested output shape
+  std::vector<T> value_data = {5}; // fill value
+
+  Tensor dims = makeInputTensor<loco::DataType::S32>(dims_shape, dims_data, memory_manager);
+  Tensor value = makeInputTensor<DT>(/*scalar*/ {}, value_data, memory_manager);
+
+  Tensor output_tensor = makeOutputTensor(DT);
+
+  Fill kernel(&dims, &value, &output_tensor);
+
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor); // allocated after configure() resized the output
+  kernel.execute();
+
+  std::vector<T> ref_output_data{5, 5, 5, 5, 5, 5}; // 2 * 3 copies of the fill value
+  EXPECT_THAT(extractTensorData<T>(output_tensor), ref_output_data);
+
+  std::vector<int32_t> ref_output_shape{2, 3};
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+template <DataType DT> void runFillQuantIntKernel(IMemoryManager *memory_manager)
+{
+  Shape dims_shape{2}; // dims is a 1-D tensor with two entries
+
+  std::vector<int32_t> dims_data = {2, 3}; // requested output shape
+  std::vector<float> value_data = {5}; // real-valued fill value, compared after dequantization
+
+  int32_t zero_point = 0; // stays 0 for S16
+
+  if (DT == loco::DataType::S8) // S8 additionally exercises a non-zero zero point
+    zero_point = 1;
+
+  Tensor dims = makeInputTensor<loco::DataType::S32>(dims_shape, dims_data, memory_manager);
+  Tensor value = makeInputTensor<DT>(/*scalar*/ {}, /*scale*/ 0.25, /*zero_point*/ zero_point,
+                                     value_data, memory_manager);
+
+  // Output is created with the same scale and zero point as the value tensor.
+  Tensor output_tensor = makeOutputTensor(DT, /*scale*/ 0.25, /*zero_point*/ zero_point);
+
+  Fill kernel(&dims, &value, &output_tensor);
+
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor); // allocated after configure() resized the output
+  kernel.execute();
+
+  std::vector<float> ref_output_data{5, 5, 5, 5, 5, 5}; // 2 * 3 copies of the fill value
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+
+  std::vector<int32_t> ref_output_shape{2, 3};
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(FillTest, FillInt) // covers the four integer value types with an S32 dims tensor
+{
+  // Run for int32_t input
+  runFillIntKernel<int32_t, loco::DataType::S32>(_memory_manager.get());
+  // Run for int64_t input
+  runFillIntKernel<int64_t, loco::DataType::S64>(_memory_manager.get());
+  // Run for int8_t input
+  runFillQuantIntKernel<loco::DataType::S8>(_memory_manager.get());
+  // Run for int16_t input
+  runFillQuantIntKernel<loco::DataType::S16>(_memory_manager.get());
+
+  SUCCEED(); // the expectations themselves live in the helper functions
+}
+
+TEST_F(FillTest, FillFloat) // float value with an S64 dims tensor
+{
+  Shape dims_shape{3}; // dims is a 1-D tensor with three entries
+
+  std::vector<int64_t> dims_data = {2, 2, 2}; // requested output shape
+  std::vector<float> value_data = {5}; // fill value
+
+  Tensor dims = makeInputTensor<loco::DataType::S64>(dims_shape, dims_data, _memory_manager.get());
+  Tensor value =
+    makeInputTensor<loco::DataType::FLOAT32>(/*scalar*/ {}, value_data, _memory_manager.get());
+
+  Tensor output_tensor = makeOutputTensor(loco::DataType::FLOAT32);
+
+  Fill kernel(&dims, &value, &output_tensor);
+
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor); // allocated after configure() resized the output
+  kernel.execute();
+
+  std::vector<float> ref_output_data{5, 5, 5, 5, 5, 5, 5, 5}; // 2 * 2 * 2 copies of the fill value
+
+  std::vector<int32_t> ref_output_shape{2, 2, 2};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), ref_output_data);
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(FillTest, Invalid_Input_Shape_NEG) // negative test: dims tensor is not 1-D
+{
+  Shape dims_shape{1, 3}; // 2-D dims tensor, which the test expects configure() to reject
+
+  std::vector<int32_t> dims_data = {2, 2, 2};
+  std::vector<float> value_data = {5};
+
+  Tensor dims = makeInputTensor<loco::DataType::S32>(dims_shape, dims_data, _memory_manager.get());
+  Tensor value =
+    makeInputTensor<loco::DataType::FLOAT32>(/*scalar*/ {}, value_data, _memory_manager.get());
+
+  Tensor output_tensor = makeOutputTensor(loco::DataType::FLOAT32);
+
+  Fill kernel(&dims, &value, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(FillTest, Invalid_Value_Shape_NEG) // negative test: value tensor is not a scalar
+{
+  Shape dims_shape{3};
+
+  std::vector<int32_t> dims_data = {2, 2, 2};
+  std::vector<float> value_data = {5};
+
+  Tensor dims = makeInputTensor<loco::DataType::S32>(dims_shape, dims_data, _memory_manager.get());
+  Tensor value = makeInputTensor<loco::DataType::FLOAT32>({1}, value_data, _memory_manager.get());
+  // value has shape {1} (rank 1) instead of rank 0, which the test expects configure() to reject
+  Tensor output_tensor = makeOutputTensor(loco::DataType::FLOAT32);
+
+  Fill kernel(&dims, &value, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/MirrorPad.cpp b/compiler/luci-interpreter/src/kernels/MirrorPad.cpp
index 2fbeefce4..bae1eac70 100644
--- a/compiler/luci-interpreter/src/kernels/MirrorPad.cpp
+++ b/compiler/luci-interpreter/src/kernels/MirrorPad.cpp
@@ -19,6 +19,8 @@
 
 #include "kernels/Utils.h"
 
+#include <limits>
+
 namespace luci_interpreter
 {
 namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Pack.cpp b/compiler/luci-interpreter/src/kernels/Pack.cpp
index 6fee93890..42aab330c 100644
--- a/compiler/luci-interpreter/src/kernels/Pack.cpp
+++ b/compiler/luci-interpreter/src/kernels/Pack.cpp
@@ -76,9 +76,8 @@ void Pack::configure()
     }
   }
 
-  if (t0->element_type() == DataType::S32 || t0->element_type() == DataType::U8 ||
-      t0->element_type() == DataType::S8 || t0->element_type() == DataType::S16 ||
-      t0->element_type() == DataType::S64)
+  if (t0->element_type() == DataType::U8 || t0->element_type() == DataType::S8 ||
+      t0->element_type() == DataType::S16)
   {
     LUCI_INTERPRETER_CHECK(output()->zero_point() == t0->zero_point());
     LUCI_INTERPRETER_CHECK(output()->scale() == t0->scale());
diff --git a/compiler/luci-interpreter/src/kernels/Pack.test.cpp b/compiler/luci-interpreter/src/kernels/Pack.test.cpp
index 2404e4303..d16320b78 100644
--- a/compiler/luci-interpreter/src/kernels/Pack.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Pack.test.cpp
@@ -38,18 +38,26 @@ void Check(std::vector<std::initializer_list<int32_t>> input_shapes,
   std::vector<Tensor> tmp_inputs;
   for (int i = 0; i < input_datas.size(); i++)
   {
-    if (std::is_same<T, float>::value)
+    if (std::is_same<T, float>::value || std::is_same<T, int32_t>::value ||
+        std::is_same<T, int64_t>::value)
     {
       tmp_inputs.push_back(Tensor(element_type, input_shapes[i], {}, ""));
       memory_manager->allocate_memory(tmp_inputs[i]);
       tmp_inputs[i].writeData(input_datas[i].data(), input_datas[i].size() * sizeof(T));
     }
-    else
+    else if (std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)
     {
       tmp_inputs.push_back(Tensor(element_type, input_shapes[i], {{1.0f / 255}, {128}}, ""));
       memory_manager->allocate_memory(tmp_inputs[i]);
       tmp_inputs[i].writeData(input_datas[i].data(), input_datas[i].size() * sizeof(T));
     }
+    else
+    {
+      assert((std::is_same<T, int16_t>::value) && "unexpected dtype is tested");
+      tmp_inputs.push_back(Tensor(element_type, input_shapes[i], {{1.0f}, {0}}, ""));
+      memory_manager->allocate_memory(tmp_inputs[i]);
+      tmp_inputs[i].writeData(input_datas[i].data(), input_datas[i].size() * sizeof(T));
+    }
   }
   for (int i = 0; i < input_datas.size(); i++)
   {
@@ -57,10 +65,14 @@ void Check(std::vector<std::initializer_list<int32_t>> input_shapes,
   }
 
   Tensor output_tensor = makeOutputTensor(element_type);
-  if (!std::is_same<T, float>::value)
+  if (std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)
   {
     output_tensor = makeOutputTensor(element_type, 1.0f / 255, 128);
   }
+  else if (std::is_same<T, int16_t>::value)
+  {
+    output_tensor = makeOutputTensor(element_type, 1.0f, 0);
+  }
 
   PackParams params{};
   params.axis = axis;
@@ -79,7 +91,7 @@ template <typename T> class PackTest : public ::testing::Test
 {
 };
 
-using DataTypes = ::testing::Types<uint8_t, float>;
+using DataTypes = ::testing::Types<uint8_t, int8_t, int16_t, int32_t, int64_t, float>;
 TYPED_TEST_SUITE(PackTest, DataTypes);
 
 TYPED_TEST(PackTest, ThreeInputs)
diff --git a/compiler/luci-interpreter/src/kernels/Pad.cpp b/compiler/luci-interpreter/src/kernels/Pad.cpp
index fe172884b..c07f6e310 100644
--- a/compiler/luci-interpreter/src/kernels/Pad.cpp
+++ b/compiler/luci-interpreter/src/kernels/Pad.cpp
@@ -20,6 +20,8 @@
 
 #include <tensorflow/lite/kernels/internal/reference/pad.h>
 
+#include <limits>
+
 namespace luci_interpreter
 {
 namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/PadV2.cpp b/compiler/luci-interpreter/src/kernels/PadV2.cpp
index e90469239..197cdaa69 100644
--- a/compiler/luci-interpreter/src/kernels/PadV2.cpp
+++ b/compiler/luci-interpreter/src/kernels/PadV2.cpp
@@ -20,6 +20,8 @@
 
 #include <tensorflow/lite/kernels/internal/reference/pad.h>
 
+#include <limits>
+
 namespace luci_interpreter
 {
 namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/ReduceMax.cpp b/compiler/luci-interpreter/src/kernels/ReduceMax.cpp
new file mode 100644
index 000000000..d58cd1563
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/ReduceMax.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ReduceMax.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/reduce.h>
+
+#include <stdexcept>
+#include <limits>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Returns the number of distinct axes that will be reduced; duplicate entries count once.
+static int getAxisReductionCount(const int32_t *axes_data, int num_axes, int input_num_dims)
+{
+  int reduction_count = num_axes; // start from all entries, then subtract the duplicates
+  for (int i = 0; i < num_axes; ++i)
+  {
+    int current = axes_data[i] >= 0 ? axes_data[i] : axes_data[i] + input_num_dims; // wrap negative
+    assert(current >= 0 && current < input_num_dims);
+    for (int j = 0; j < i; j++) // compare with every earlier entry
+    {
+      int previous = axes_data[j] >= 0 ? axes_data[j] : axes_data[j] + input_num_dims;
+      // This checks for duplicate axis
+      if (current == previous)
+      {
+        --reduction_count;
+        break; // each duplicate entry is subtracted only once
+      }
+    }
+  }
+  return reduction_count;
+}
+
+static Shape getOutputShape(const Shape &input_shape, const int32_t *axes_data, int num_axes,
+                            bool keep_dims) // shape of the reduction result
+{
+  int input_num_dims = input_shape.num_dims();
+  if (input_num_dims == 0) // rank-0 input: the result is rank 0 as well
+  {
+    return Shape(0);
+  }
+
+  if (keep_dims) // reduced axes are kept with extent 1
+  {
+    Shape output_shape(input_num_dims);
+    for (int idx = 0; idx < input_num_dims; ++idx)
+    {
+      bool is_axis = false;
+      for (int axis_idx = 0; axis_idx < num_axes; ++axis_idx)
+      {
+        // matches either the non-negative form or the negative (wrapped) form of the axis
+        if (axes_data[axis_idx] == idx || axes_data[axis_idx] + input_num_dims == idx)
+        {
+          is_axis = true;
+          break;
+        }
+      }
+      if (is_axis)
+      {
+        output_shape.dim(idx) = 1;
+      }
+      else
+      {
+        output_shape.dim(idx) = input_shape.dim(idx);
+      }
+    }
+    return output_shape;
+  }
+  else // reduced axes are dropped from the result
+  {
+    int num_reduce_axes = getAxisReductionCount(axes_data, num_axes, input_num_dims);
+    Shape output_shape(input_num_dims - num_reduce_axes);
+    int num_skip_axes = 0; // reduced axes seen so far; shifts later dims towards the front
+    for (int idx = 0; idx < input_num_dims; ++idx)
+    {
+      bool is_axis = false;
+      for (int axis_idx = 0; axis_idx < num_axes; ++axis_idx)
+      {
+        // matches either the non-negative form or the negative (wrapped) form of the axis
+        if (axes_data[axis_idx] == idx || axes_data[axis_idx] + input_num_dims == idx)
+        {
+          ++num_skip_axes;
+          is_axis = true;
+          break;
+        }
+      }
+      if (!is_axis)
+      {
+        output_shape.dim(idx - num_skip_axes) = input_shape.dim(idx);
+      }
+    }
+    return output_shape;
+  }
+}
+
+ReduceMax::ReduceMax(const Tensor *input, const Tensor *axes, Tensor *output, Tensor *temp_index,
+                     Tensor *resolved_axes, const ReducerParams &params)
+  : KernelWithParams<ReducerParams>({input, axes}, {output, temp_index, resolved_axes}, params)
+{ // temp_index and resolved_axes are registered as outputs 1 and 2, after the real output
+}
+
+void ReduceMax::configure() // validates types, sizes the output and the two scratch tensors
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(axes()->element_type() == DataType::S32);
+
+  const Shape &input_shape = input()->shape();
+  int input_num_dims = input_shape.num_dims();
+
+  const auto *axes_data = getTensorData<int32_t>(axes());
+  int num_axes = axes()->shape().num_elements();
+  LUCI_INTERPRETER_CHECK(num_axes <= 4);
+
+  // We compute shapes of outputs in configure, assuming that outputs have
+  // static shape
+  // TODO Support dynamic shape
+  Shape output_shape = getOutputShape(input_shape, axes_data, num_axes, _params.keep_dims);
+  output()->resize(output_shape);
+
+  auto temp_index = getOutputTensors()[1];
+  auto resolved_axes = getOutputTensors()[2];
+  // Scratch buffers are 1-D; Shape(n) would build a rank-n shape, so the count is braced.
+  temp_index->resize(Shape({input_num_dims})); // one int per input dimension
+  resolved_axes->resize(Shape({num_axes}));    // one int per requested axis
+}
+
+void ReduceMax::execute() const // dispatches on the input element type
+{
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32: // the only element type handled so far
+      evalFloat();
+      break;
+    // TODO Support quantized kernels
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void ReduceMax::evalFloat() const // float max-reduction over the requested axes
+{
+  const auto *axes_data = getTensorData<int32_t>(axes());
+  int num_axes = axes()->shape().num_elements();
+
+  auto temp_index = getOutputTensors()[1]; // scratch tensor passed on to ReduceGeneric
+  auto resolved_axes = getOutputTensors()[2]; // receives the result of ResolveAxis
+  // NOTE(review): ResolveAxis presumably normalizes and de-duplicates the axes - confirm
+  int num_resolved_axis = 0;
+  LUCI_INTERPRETER_CHECK(
+    tflite::reference_ops::ResolveAxis(input()->shape().num_dims(), axes_data, num_axes,
+                                       getTensorData<int>(resolved_axes), &num_resolved_axis));
+  // Start from the lowest finite float so that any input value replaces it.
+  float init_value = std::numeric_limits<float>::lowest();
+  tflite::reference_ops::ReduceGeneric<float>(
+    getTensorData<float>(input()), getTensorShape(input()).DimsData(), input()->shape().num_dims(),
+    getTensorData<float>(output()), getTensorShape(output()).DimsData(),
+    output()->shape().num_dims(), axes_data, num_axes, _params.keep_dims,
+    getTensorData<int>(temp_index), getTensorData<int>(resolved_axes), init_value,
+    [](const float current, const float in) -> float { return (in > current) ? in : current; });
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/ReduceMax.h b/compiler/luci-interpreter/src/kernels/ReduceMax.h
new file mode 100644
index 000000000..25a66278a
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/ReduceMax.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_REDUCE_MAX_H
+#define LUCI_INTERPRETER_KERNELS_REDUCE_MAX_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+#include <memory>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class ReduceMax : public KernelWithParams<ReducerParams> // max-reduction over the given axes
+{
+public:
+  ReduceMax(const Tensor *input, const Tensor *axes, Tensor *output, Tensor *temp_index,
+            Tensor *resolved_axes, const ReducerParams &params);
+
+  const Tensor *input() const { return _inputs[0]; } // first kernel input
+  const Tensor *axes() const { return _inputs[1]; } // second kernel input
+  Tensor *output() const { return _outputs[0]; } // first kernel output
+
+  void configure() override;
+  void execute() const override; // throws std::runtime_error unless the input is FLOAT32
+
+private:
+  void evalFloat() const; // FLOAT32 evaluation path used by execute()
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_REDUCE_MAX_H
diff --git a/compiler/luci-interpreter/src/kernels/ReduceMax.test.cpp b/compiler/luci-interpreter/src/kernels/ReduceMax.test.cpp
new file mode 100644
index 000000000..ab688827b
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/ReduceMax.test.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ReduceMax.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class ReduceMaxTest : public ::testing::Test // fixture providing a memory manager to each test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager; // (re)created in SetUp()
+};
+
+TEST_F(ReduceMaxTest, FloatNotKeepDims)
+{
+  std::vector<float> input_data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                                   9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                                   17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+
+  std::vector<int32_t> axis_data{1, 0, -3, -3}; // mixes negative and repeated axes
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({4, 3, 2}, input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({4}, axis_data, _memory_manager.get());
+  Tensor temp_index(DataType::S32, Shape({}), {}, ""); // scratch tensor, resized by configure()
+  Tensor resolved_axes(DataType::S32, Shape({}), {}, ""); // scratch tensor, resized by configure()
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ReducerParams params{};
+  params.keep_dims = false;
+
+  ReduceMax kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes,
+                   params);
+  kernel.configure(); // resizes the output and both scratch tensors
+  _memory_manager->allocate_memory(temp_index);
+  _memory_manager->allocate_memory(resolved_axes);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{23, 24}; // expected maxima
+  std::initializer_list<int32_t> ref_output_shape{2}; // only the last input dimension remains
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(ReduceMaxTest, FloatKeepDims)
+{
+  std::vector<float> input_data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                                   9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                                   17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+
+  std::vector<int32_t> axis_data{0, 2}; // reduce the first and the last dimension
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({4, 3, 2}, input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({2}, axis_data, _memory_manager.get());
+  Tensor temp_index(DataType::S32, Shape({}), {}, ""); // scratch tensor, resized by configure()
+  Tensor resolved_axes(DataType::S32, Shape({}), {}, ""); // scratch tensor, resized by configure()
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ReducerParams params{};
+  params.keep_dims = true;
+
+  ReduceMax kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes,
+                   params);
+  kernel.configure(); // resizes the output and both scratch tensors
+  _memory_manager->allocate_memory(temp_index);
+  _memory_manager->allocate_memory(resolved_axes);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{20, 22, 24}; // expected maxima
+  std::initializer_list<int32_t> ref_output_shape{1, 3, 1}; // reduced dimensions kept as size 1
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Shape.cpp b/compiler/luci-interpreter/src/kernels/Shape.cpp
new file mode 100644
index 000000000..0429fe1e5
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Shape.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Shape.h"
+#include "kernels/Utils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+ShapeKernel::ShapeKernel(const Tensor *input, Tensor *output, const ShapeParams &params)
+  : KernelWithParams<ShapeParams>({input}, {output}, params) // one input, one output
+{
+}
+
+// Validates the output type and sizes the output as a 1-D tensor of length rank(input).
+void ShapeKernel::configure()
+{
+  // execute() picks the element width from out_type, so it must agree with the output tensor.
+  LUCI_INTERPRETER_CHECK(output()->element_type() == params().out_type);
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::S32 or
+                         output()->element_type() == DataType::S64);
+  Shape output_shape(1);
+  output_shape.dim(0) = input()->shape().num_dims();
+  output()->resize(output_shape);
+}
+
+// Emits the input dimensions as int32 or int64, chosen by the out_type parameter.
+void ShapeKernel::execute() const
+{
+  const auto out_type = params().out_type;
+
+  if (out_type == DataType::S32)
+    return evalInt<int32_t>();
+
+  if (out_type == DataType::S64)
+    return evalInt<int64_t>();
+
+  // Any other requested type is rejected.
+  throw std::runtime_error("Unsupported type.");
+}
+
+// Stores every dimension of the input's shape into the output buffer, converted to T.
+template <typename T> void ShapeKernel::evalInt() const
+{
+  const auto dims = input()->shape();
+  const int rank = dims.num_dims();
+  T *dst = getTensorData<T>(output());
+
+  // Element i of the output receives the extent of input dimension i.
+  for (int axis = 0; axis < rank; ++axis)
+    dst[axis] = dims.dim(axis);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Shape.h b/compiler/luci-interpreter/src/kernels/Shape.h
new file mode 100644
index 000000000..cfaadec91
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Shape.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SHAPE_H
+#define LUCI_INTERPRETER_KERNELS_SHAPE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class ShapeKernel : public KernelWithParams<ShapeParams> // outputs the shape of its input tensor
+{
+public:
+  ShapeKernel(const Tensor *input, Tensor *output, const ShapeParams &params);
+
+  const Tensor *input() const { return _inputs[0]; } // first kernel input
+  Tensor *output() const { return _outputs[0]; } // first kernel output
+
+  void configure() override;
+  void execute() const override; // dispatches on params().out_type
+
+private:
+  template <typename T> void evalInt() const; // T is the element type written to the output
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SHAPE_H
diff --git a/compiler/luci-interpreter/src/kernels/Shape.test.cpp b/compiler/luci-interpreter/src/kernels/Shape.test.cpp
new file mode 100644
index 000000000..4763e016c
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Shape.test.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Shape.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class ShapeTest : public ::testing::Test // fixture providing a memory manager to each test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager; // (re)created in SetUp()
+};
+
+template <typename T> void runShapeKernel(loco::DataType dataType, IMemoryManager *memory_manager)
+{
+  Shape input_shape{1, 3, 1, 3, 5}; // the kernel is expected to output exactly these values
+
+  Tensor input_tensor = Tensor(loco::DataType::FLOAT32, input_shape, {}, "");
+  Tensor output_tensor = makeOutputTensor(dataType); // element type under test
+
+  ShapeParams params{};
+  params.out_type = dataType; // same type as the output tensor
+
+  ShapeKernel kernel(&input_tensor, &output_tensor, params);
+
+  kernel.configure(); // sizes the output tensor
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<T> ref_output_data{1, 3, 1, 3, 5};
+  EXPECT_THAT(extractTensorData<T>(output_tensor), ref_output_data);
+
+  std::vector<int32_t> ref_output_shape{5}; // one element per input dimension
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(ShapeTest, OutTypeInt)
+{
+
+  // Run for int32_t output
+  runShapeKernel<int32_t>(loco::DataType::S32, _memory_manager.get());
+  // Run for int64_t output
+  runShapeKernel<int64_t>(loco::DataType::S64, _memory_manager.get());
+
+  SUCCEED(); // the actual expectations are checked inside runShapeKernel
+}
+
+TEST_F(ShapeTest, Invalid_Output_Type_NEG)
+{
+  Shape input_shape{1, 3};
+
+  Tensor input_tensor = Tensor(loco::DataType::FLOAT32, input_shape, {}, "");
+  Tensor output_tensor = makeOutputTensor(loco::DataType::FLOAT32); // neither S32 nor S64
+
+  ShapeParams params{};
+  params.out_type = loco::DataType::FLOAT32;
+
+  ShapeKernel kernel(&input_tensor, &output_tensor, params);
+
+  EXPECT_ANY_THROW(kernel.configure()); // configure() accepts only S32/S64 outputs
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/SplitV.cpp b/compiler/luci-interpreter/src/kernels/SplitV.cpp
index 281988272..aa6820889 100644
--- a/compiler/luci-interpreter/src/kernels/SplitV.cpp
+++ b/compiler/luci-interpreter/src/kernels/SplitV.cpp
@@ -43,14 +43,36 @@ void SplitV::configure()
   auto sizes_data = getTensorData<int32_t>(size_splits());
 
   assert(size_splits()->shape().num_dims() == 1);
+
+  // Add up the explicitly given split sizes and count the -1 placeholders.
+  // Every entry is visited, including the last one, so that the size inferred
+  // for a -1 entry is correct wherever it appears in size_splits.
+  int32_t sum = 0;
+  const auto num_dims_size_spits = size_splits()->shape().dim(0);
+  int32_t count_neg_dim = 0;
+
+  for (int32_t i = 0; i < num_dims_size_spits; ++i)
+  {
+    if (sizes_data[i] != -1)
+      sum += sizes_data[i];
+    else
+      count_neg_dim++;
+  }
+  // At most one entry may be left for inference.
+  assert(count_neg_dim < 2);
   assert(size_splits()->shape().num_elements() == num_split);
-  assert(std::accumulate(sizes_data, sizes_data + num_split, 0) ==
-         input()->shape().dim(_axis_value));
 
   auto output_shape = input()->shape();
   for (int32_t i = 0; i < num_split; ++i)
   {
-    output_shape.dim(_axis_value) = sizes_data[i];
+    if (sizes_data[i] == -1)
+    {
+      output_shape.dim(_axis_value) = input()->shape().dim(_axis_value) - sum;
+    }
+    else
+    {
+      output_shape.dim(_axis_value) = sizes_data[i];
+    }
     _outputs[i]->resize(output_shape);
   }
 }
diff --git a/compiler/luci-interpreter/src/kernels/StridedSlice.cpp b/compiler/luci-interpreter/src/kernels/StridedSlice.cpp
index c6452cdb0..a8730d861 100644
--- a/compiler/luci-interpreter/src/kernels/StridedSlice.cpp
+++ b/compiler/luci-interpreter/src/kernels/StridedSlice.cpp
@@ -136,6 +136,11 @@ void StridedSlice::execute() const
                                           getTensorData<uint8_t>(input()), getTensorShape(output()),
                                           getTensorData<uint8_t>(output()));
       break;
+    case DataType::S32:
+      tflite::reference_ops::StridedSlice(op_params, getTensorShape(input()),
+                                          getTensorData<int32_t>(input()), getTensorShape(output()),
+                                          getTensorData<int32_t>(output()));
+      break;
     default:
       throw std::runtime_error("Unsupported type.");
   }
diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
index dba39050c..40207090b 100644
--- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp
+++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
@@ -187,7 +187,7 @@ void GraphLoader::loadTensors()
     const auto *node = loco::must_cast<const luci::CircleNode *>(_graph->nodes()->at(i));
 
     if (node->opcode() == luci::CircleOpcode::CUSTOM && !isSupportedCustomNode(node))
-      throw std::runtime_error("Unknown Custom Node, yet.");
+      throw std::runtime_error("Unsupported Custom operator. " + node->name());
 
     if (!isTensorProducingNode(node))
       continue;
diff --git a/compiler/luci-interpreter/src/loader/nodes/Add.cpp b/compiler/luci-interpreter/src/loader/nodes/Add.cpp
index decccaa1d..501e84752 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Add.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Add.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleAdd(const luci::CircleNode *circle_node,
                                                KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleAdd *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleAdd *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/ArgMax.cpp b/compiler/luci-interpreter/src/loader/nodes/ArgMax.cpp
index 0ee367748..f3ca55744 100644
--- a/compiler/luci-interpreter/src/loader/nodes/ArgMax.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/ArgMax.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleArgMax(const luci::CircleNode *circle_node,
                                                   KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleArgMax *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleArgMax *>(circle_node);
   assert(node->arity() == 2);
   const Tensor *input = helper.getInputTensor(node->input());
   const Tensor *axis = helper.getInputTensor(node->dimension());
diff --git a/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp b/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp
index efb011257..a8135706f 100644
--- a/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp
@@ -25,9 +25,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleAveragePool2D(const luci::CircleNode *circle_node,
                                                          KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleAveragePool2D *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleAveragePool2D *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->value());
diff --git a/compiler/luci-interpreter/src/loader/nodes/BatchMatMul.cpp b/compiler/luci-interpreter/src/loader/nodes/BatchMatMul.cpp
index aae3dbab1..9da2f6d93 100644
--- a/compiler/luci-interpreter/src/loader/nodes/BatchMatMul.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/BatchMatMul.cpp
@@ -25,9 +25,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleBatchMatMul(const luci::CircleNode *circle_node,
                                                        KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleBatchMatMul *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleBatchMatMul *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *lhs = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp b/compiler/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp
index 33d0e2db6..ac6ebb30f 100644
--- a/compiler/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleBatchToSpaceND(const luci::CircleNode *circle_node,
                                                           KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleBatchToSpaceND *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleBatchToSpaceND *>(circle_node);
   assert(node->arity() == 3);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Cast.cpp b/compiler/luci-interpreter/src/loader/nodes/Cast.cpp
index 21ea5ceab..a16354c96 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Cast.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Cast.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleCast(const luci::CircleNode *circle_node,
                                                 KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleCast *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleCast *>(circle_node);
 
   assert(node->arity() == 1);
 
diff --git a/compiler/luci-interpreter/src/loader/nodes/Concatenation.cpp b/compiler/luci-interpreter/src/loader/nodes/Concatenation.cpp
index 7823a9967..ba2564ea2 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Concatenation.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Concatenation.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleConcatenation(const luci::CircleNode *circle_node,
                                                          KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleConcatenation *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleConcatenation *>(circle_node);
   std::vector<const Tensor *> inputs(node->numValues());
   for (uint32_t i = 0; i < node->numValues(); ++i)
   {
diff --git a/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp b/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp
index b48d97d19..218165e20 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp
@@ -25,9 +25,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleConv2D(const luci::CircleNode *circle_node,
                                                   KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleConv2D *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleConv2D *>(circle_node);
   assert(node->arity() == 3);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/DepthToSpace.cpp b/compiler/luci-interpreter/src/loader/nodes/DepthToSpace.cpp
index 0310fb23f..174946367 100644
--- a/compiler/luci-interpreter/src/loader/nodes/DepthToSpace.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/DepthToSpace.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleDepthToSpace(const luci::CircleNode *circle_node,
                                                         KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleDepthToSpace *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleDepthToSpace *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp b/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp
index db26ecf2e..8af1e3b58 100644
--- a/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp
@@ -25,9 +25,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleDepthwiseConv2D(const luci::CircleNode *circle_node,
                                                            KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleDepthwiseConv2D *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleDepthwiseConv2D *>(circle_node);
   assert(node->arity() == 3);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Dequantize.cpp b/compiler/luci-interpreter/src/loader/nodes/Dequantize.cpp
index 4aae56469..787322e9b 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Dequantize.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Dequantize.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleDequantize(const luci::CircleNode *circle_node,
                                                       KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleDequantize *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleDequantize *>(circle_node);
 
   const Tensor *input = helper.getInputTensor(node->input());
   Tensor *output = helper.getOutputTensor(node);
diff --git a/compiler/luci-interpreter/src/loader/nodes/Div.cpp b/compiler/luci-interpreter/src/loader/nodes/Div.cpp
index 56c2e98f2..0611dfdab 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Div.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Div.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleDiv(const luci::CircleNode *circle_node,
                                                KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleDiv *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleDiv *>(circle_node);
   assert(node->arity() == 2);
   const Tensor *input1 = helper.getInputTensor(node->x());
   const Tensor *input2 = helper.getInputTensor(node->y());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Elu.cpp b/compiler/luci-interpreter/src/loader/nodes/Elu.cpp
index 98ee78be7..a79985e3b 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Elu.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Elu.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleElu(const luci::CircleNode *circle_node,
                                                KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleElu *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleElu *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->features());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Equal.cpp b/compiler/luci-interpreter/src/loader/nodes/Equal.cpp
index 649d9bfe9..59692883f 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Equal.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Equal.cpp
@@ -25,9 +25,7 @@ std::unique_ptr<Kernel> build_kernel_CircleEqual(const luci::CircleNode *circle_
                                                  KernelBuilderHelper &helper)
 
 {
-  const auto *node = dynamic_cast<const luci::CircleEqual *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleEqual *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *x = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Exp.cpp b/compiler/luci-interpreter/src/loader/nodes/Exp.cpp
index 411d142c3..30d11cb89 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Exp.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Exp.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleExp(const luci::CircleNode *circle_node,
                                                KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleExp *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleExp *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Fill.cpp b/compiler/luci-interpreter/src/loader/nodes/Fill.cpp
new file mode 100644
index 000000000..3aefdf1c5
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Fill.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Fill.h"
+
+namespace luci_interpreter
+{
+
+// Builds the Fill kernel for a CircleFill node from its dims/value inputs and its output.
+std::unique_ptr<Kernel> build_kernel_CircleFill(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *fill_node = loco::must_cast<const luci::CircleFill *>(circle_node);
+  assert(fill_node->arity() == 2);
+
+  const Tensor *dims_tensor = helper.getInputTensor(fill_node->dims());
+  const Tensor *value_tensor = helper.getInputTensor(fill_node->value());
+  Tensor *output_tensor = helper.getOutputTensor(fill_node);
+  return std::make_unique<kernels::Fill>(dims_tensor, value_tensor, output_tensor);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Floor.cpp b/compiler/luci-interpreter/src/loader/nodes/Floor.cpp
index 6d8435f6c..e0a223116 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Floor.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Floor.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleFloor(const luci::CircleNode *circle_node,
                                                  KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleFloor *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleFloor *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/FloorDiv.cpp b/compiler/luci-interpreter/src/loader/nodes/FloorDiv.cpp
index cae2e186e..a45d89e38 100644
--- a/compiler/luci-interpreter/src/loader/nodes/FloorDiv.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/FloorDiv.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleFloorDiv(const luci::CircleNode *circle_node,
                                                     KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleFloorDiv *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleFloorDiv *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *x = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp b/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp
index 0b8ac44bd..b7b742b8a 100644
--- a/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleFullyConnected(const luci::CircleNode *circle_node,
                                                           KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleFullyConnected *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleFullyConnected *>(circle_node);
   assert(node->arity() == 3);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Gather.cpp b/compiler/luci-interpreter/src/loader/nodes/Gather.cpp
index 9df9775c5..2ee2906e0 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Gather.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Gather.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleGather(const luci::CircleNode *circle_node,
                                                   KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleGather *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleGather *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *params = helper.getInputTensor(node->params());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Greater.cpp b/compiler/luci-interpreter/src/loader/nodes/Greater.cpp
index 3db11b840..80aa63cf0 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Greater.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Greater.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleGreater(const luci::CircleNode *circle_node,
                                                    KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleGreater *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleGreater *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *x = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/GreaterEqual.cpp b/compiler/luci-interpreter/src/loader/nodes/GreaterEqual.cpp
index dbe051d67..272f2843b 100644
--- a/compiler/luci-interpreter/src/loader/nodes/GreaterEqual.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/GreaterEqual.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleGreaterEqual(const luci::CircleNode *circle_node,
                                                         KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleGreaterEqual *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleGreaterEqual *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *x = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/If.cpp b/compiler/luci-interpreter/src/loader/nodes/If.cpp
index 5983f4d3b..3ac7d4941 100644
--- a/compiler/luci-interpreter/src/loader/nodes/If.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/If.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleIf(const luci::CircleNode *circle_node,
                                               KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleIf *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleIf *>(circle_node);
   auto output_nodes = collectOutputNodes<luci::CircleIfOut>(node);
   assert(node->arity() == 1 + node->input_count());
   assert(output_nodes.size() == static_cast<size_t>(node->output_count()));
diff --git a/compiler/luci-interpreter/src/loader/nodes/InstanceNorm.cpp b/compiler/luci-interpreter/src/loader/nodes/InstanceNorm.cpp
index 0a8fb85e2..06031e5bc 100644
--- a/compiler/luci-interpreter/src/loader/nodes/InstanceNorm.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/InstanceNorm.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleInstanceNorm(const luci::CircleNode *circle_node,
                                                         KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleInstanceNorm *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleInstanceNorm *>(circle_node);
   assert(node->arity() == 3);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/L2Normalize.cpp b/compiler/luci-interpreter/src/loader/nodes/L2Normalize.cpp
index 05f920266..6e22e6d4e 100644
--- a/compiler/luci-interpreter/src/loader/nodes/L2Normalize.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/L2Normalize.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleL2Normalize(const luci::CircleNode *circle_node,
                                                        KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleL2Normalize *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleL2Normalize *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/L2Pool2D.cpp b/compiler/luci-interpreter/src/loader/nodes/L2Pool2D.cpp
index 0e70afafa..95b55896f 100644
--- a/compiler/luci-interpreter/src/loader/nodes/L2Pool2D.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/L2Pool2D.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleL2Pool2D(const luci::CircleNode *circle_node,
                                                     KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleL2Pool2D *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleL2Pool2D *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->value());
diff --git a/compiler/luci-interpreter/src/loader/nodes/LeakyRelu.cpp b/compiler/luci-interpreter/src/loader/nodes/LeakyRelu.cpp
index 7b229ad0e..bbf5067b1 100644
--- a/compiler/luci-interpreter/src/loader/nodes/LeakyRelu.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/LeakyRelu.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleLeakyRelu(const luci::CircleNode *circle_node,
                                                      KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleLeakyRelu *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleLeakyRelu *>(circle_node);
   assert(node->arity() == 1);
   const Tensor *input = helper.getInputTensor(node->features());
   Tensor *output = helper.getOutputTensor(node);
diff --git a/compiler/luci-interpreter/src/loader/nodes/Less.cpp b/compiler/luci-interpreter/src/loader/nodes/Less.cpp
index 81156f275..ae914ecc9 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Less.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Less.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleLess(const luci::CircleNode *circle_node,
                                                 KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleLess *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleLess *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *x = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/LessEqual.cpp b/compiler/luci-interpreter/src/loader/nodes/LessEqual.cpp
index 82141e5ae..f1b424b55 100644
--- a/compiler/luci-interpreter/src/loader/nodes/LessEqual.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/LessEqual.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleLessEqual(const luci::CircleNode *circle_node,
                                                      KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleLessEqual *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleLessEqual *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *x = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp b/compiler/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp
index a12dce0a0..962ca2d7c 100644
--- a/compiler/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp
@@ -25,9 +25,7 @@ std::unique_ptr<Kernel>
 build_kernel_CircleLocalResponseNormalization(const luci::CircleNode *circle_node,
                                               KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleLocalResponseNormalization *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleLocalResponseNormalization *>(circle_node);
   assert(node->arity() == 1);
   const Tensor *input = helper.getInputTensor(node->input());
   Tensor *output = helper.getOutputTensor(node);
diff --git a/compiler/luci-interpreter/src/loader/nodes/LogSoftmax.cpp b/compiler/luci-interpreter/src/loader/nodes/LogSoftmax.cpp
index 6cf547aae..432204115 100644
--- a/compiler/luci-interpreter/src/loader/nodes/LogSoftmax.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/LogSoftmax.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleLogSoftmax(const luci::CircleNode *circle_node,
                                                       KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleLogSoftmax *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleLogSoftmax *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->logits());
diff --git a/compiler/luci-interpreter/src/loader/nodes/LogicalAnd.cpp b/compiler/luci-interpreter/src/loader/nodes/LogicalAnd.cpp
index 2c9549f71..bf3cb671a 100644
--- a/compiler/luci-interpreter/src/loader/nodes/LogicalAnd.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/LogicalAnd.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleLogicalAnd(const luci::CircleNode *circle_node,
                                                       KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleLogicalAnd *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleLogicalAnd *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/LogicalNot.cpp b/compiler/luci-interpreter/src/loader/nodes/LogicalNot.cpp
index 3d327d6c4..fefcd9a06 100644
--- a/compiler/luci-interpreter/src/loader/nodes/LogicalNot.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/LogicalNot.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleLogicalNot(const luci::CircleNode *circle_node,
                                                       KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleLogicalNot *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleLogicalNot *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/LogicalOr.cpp b/compiler/luci-interpreter/src/loader/nodes/LogicalOr.cpp
index 50566bb30..a416cb401 100644
--- a/compiler/luci-interpreter/src/loader/nodes/LogicalOr.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/LogicalOr.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleLogicalOr(const luci::CircleNode *circle_node,
                                                      KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleLogicalOr *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleLogicalOr *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Logistic.cpp b/compiler/luci-interpreter/src/loader/nodes/Logistic.cpp
index e4160edb3..4a69deef1 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Logistic.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Logistic.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleLogistic(const luci::CircleNode *circle_node,
                                                     KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleLogistic *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleLogistic *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/MaxPool2D.cpp b/compiler/luci-interpreter/src/loader/nodes/MaxPool2D.cpp
index 914f22838..f66a206ca 100644
--- a/compiler/luci-interpreter/src/loader/nodes/MaxPool2D.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/MaxPool2D.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleMaxPool2D(const luci::CircleNode *circle_node,
                                                      KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleMaxPool2D *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleMaxPool2D *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->value());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Maximum.cpp b/compiler/luci-interpreter/src/loader/nodes/Maximum.cpp
index dc50d6773..d0bff776a 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Maximum.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Maximum.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleMaximum(const luci::CircleNode *circle_node,
                                                    KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleMaximum *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleMaximum *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Mean.cpp b/compiler/luci-interpreter/src/loader/nodes/Mean.cpp
index 97d91207f..0dec63e79 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Mean.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Mean.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleMean(const luci::CircleNode *circle_node,
                                                 KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleMean *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleMean *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Minimum.cpp b/compiler/luci-interpreter/src/loader/nodes/Minimum.cpp
index ff659524a..1a49c1090 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Minimum.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Minimum.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleMinimum(const luci::CircleNode *circle_node,
                                                    KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleMinimum *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleMinimum *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/MirrorPad.cpp b/compiler/luci-interpreter/src/loader/nodes/MirrorPad.cpp
index ebf294583..b221b4574 100644
--- a/compiler/luci-interpreter/src/loader/nodes/MirrorPad.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/MirrorPad.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleMirrorPad(const luci::CircleNode *circle_node,
                                                      KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleMirrorPad *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleMirrorPad *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Mul.cpp b/compiler/luci-interpreter/src/loader/nodes/Mul.cpp
index 4f9da967d..f9984853a 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Mul.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Mul.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleMul(const luci::CircleNode *circle_node,
                                                KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleMul *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleMul *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Neg.cpp b/compiler/luci-interpreter/src/loader/nodes/Neg.cpp
index 23c00537b..9a9ecf991 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Neg.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Neg.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleNeg(const luci::CircleNode *circle_node,
                                                KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleNeg *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleNeg *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/NotEqual.cpp b/compiler/luci-interpreter/src/loader/nodes/NotEqual.cpp
index 8e5711fc1..3916a5854 100644
--- a/compiler/luci-interpreter/src/loader/nodes/NotEqual.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/NotEqual.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleNotEqual(const luci::CircleNode *circle_node,
                                                     KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleNotEqual *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleNotEqual *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *x = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/PRelu.cpp b/compiler/luci-interpreter/src/loader/nodes/PRelu.cpp
index e31601bf6..f3d700c95 100644
--- a/compiler/luci-interpreter/src/loader/nodes/PRelu.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/PRelu.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CirclePRelu(const luci::CircleNode *circle_node,
                                                  KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CirclePRelu *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CirclePRelu *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Pack.cpp b/compiler/luci-interpreter/src/loader/nodes/Pack.cpp
index 699472081..efc5850e0 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Pack.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Pack.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CirclePack(const luci::CircleNode *circle_node,
                                                 KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CirclePack *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CirclePack *>(circle_node);
   assert(node->arity() == node->values_count());
 
   std::vector<const Tensor *> inputs(node->values_count());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Pad.cpp b/compiler/luci-interpreter/src/loader/nodes/Pad.cpp
index 770549295..67ce997a7 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Pad.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Pad.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CirclePad(const luci::CircleNode *circle_node,
                                                KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CirclePad *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CirclePad *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/PadV2.cpp b/compiler/luci-interpreter/src/loader/nodes/PadV2.cpp
index 12deb15f0..e378a972a 100644
--- a/compiler/luci-interpreter/src/loader/nodes/PadV2.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/PadV2.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CirclePadV2(const luci::CircleNode *circle_node,
                                                  KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CirclePadV2 *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CirclePadV2 *>(circle_node);
   assert(node->arity() == 3);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Pow.cpp b/compiler/luci-interpreter/src/loader/nodes/Pow.cpp
index b430bc94f..d32fc3dbb 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Pow.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Pow.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CirclePow(const luci::CircleNode *circle_node,
                                                KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CirclePow *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CirclePow *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Quantize.cpp b/compiler/luci-interpreter/src/loader/nodes/Quantize.cpp
index fd9836345..cb36fb6da 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Quantize.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Quantize.cpp
@@ -24,9 +24,8 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleQuantize(const luci::CircleNode *circle_node,
                                                     KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleQuantize *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleQuantize *>(circle_node);
+  assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->input());
   Tensor *output = helper.getOutputTensor(node);
diff --git a/compiler/luci-interpreter/src/loader/nodes/ReduceMax.cpp b/compiler/luci-interpreter/src/loader/nodes/ReduceMax.cpp
new file mode 100644
index 000000000..1a8522dd6
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/ReduceMax.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/ReduceMax.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleReduceMax(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{ // Creates the kernels::ReduceMax instance that corresponds to a CircleReduceMax node.
+  const auto *node = loco::must_cast<const luci::CircleReduceMax *>(circle_node);
+  assert(node->arity() == 2);
+  // Look up the runtime tensors for both operands and for this node's own output.
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *axes = helper.getInputTensor(node->reduction_indices());
+  Tensor *output = helper.getOutputTensor(node);
+  // First scratch tensor: S32, empty shape, unnamed, non-observable, data buffer set to nullptr.
+  auto temp_index_unique =
+    std::make_unique<Tensor>(DataType::S32, Shape({}), AffineQuantization{}, "");
+  temp_index_unique->set_observable(false);
+  temp_index_unique->set_data_buffer(nullptr);
+  Tensor *temp_index =
+    helper.getRuntimeGraph(node->graph())->addTensor(std::move(temp_index_unique));
+  // Second scratch tensor, built identically; both are moved into the node's runtime graph.
+  auto resolved_axes_unique =
+    std::make_unique<Tensor>(DataType::S32, Shape({}), AffineQuantization{}, "");
+  resolved_axes_unique->set_observable(false);
+  resolved_axes_unique->set_data_buffer(nullptr);
+  Tensor *resolved_axes =
+    helper.getRuntimeGraph(node->graph())->addTensor(std::move(resolved_axes_unique));
+  // keep_dims is the only field copied from the node; the rest stays value-initialized.
+  ReducerParams params{};
+  params.keep_dims = node->keep_dims();
+  // NOTE(review): scratch tensors are presumably sized by the kernel itself - confirm there.
+  return std::make_unique<kernels::ReduceMax>(input, axes, output, temp_index, resolved_axes,
+                                              params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Relu.cpp b/compiler/luci-interpreter/src/loader/nodes/Relu.cpp
index d53a66a06..1d64c1c4e 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Relu.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Relu.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleRelu(const luci::CircleNode *circle_node,
                                                 KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleRelu *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleRelu *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->features());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Relu6.cpp b/compiler/luci-interpreter/src/loader/nodes/Relu6.cpp
index f1b5d219b..e50cd2545 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Relu6.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Relu6.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleRelu6(const luci::CircleNode *circle_node,
                                                  KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleRelu6 *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleRelu6 *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->features());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Reshape.cpp b/compiler/luci-interpreter/src/loader/nodes/Reshape.cpp
index 89e3ecebf..76ddd88a3 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Reshape.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Reshape.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleReshape(const luci::CircleNode *circle_node,
                                                    KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleReshape *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleReshape *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input = helper.getInputTensor(node->tensor());
diff --git a/compiler/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp b/compiler/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp
index dca56588d..dc2b88ad3 100644
--- a/compiler/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleResizeBilinear(const luci::CircleNode *circle_node,
                                                           KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleResizeBilinear *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleResizeBilinear *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp b/compiler/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp
index d1ea19c0f..c7058ae78 100644
--- a/compiler/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp
@@ -25,9 +25,7 @@ std::unique_ptr<Kernel>
 build_kernel_CircleResizeNearestNeighbor(const luci::CircleNode *circle_node,
                                          KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleResizeNearestNeighbor *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleResizeNearestNeighbor *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/ReverseV2.cpp b/compiler/luci-interpreter/src/loader/nodes/ReverseV2.cpp
index ea00f5408..c1a7f5350 100644
--- a/compiler/luci-interpreter/src/loader/nodes/ReverseV2.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/ReverseV2.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleReverseV2(const luci::CircleNode *circle_node,
                                                      KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleReverseV2 *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleReverseV2 *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input = helper.getInputTensor(node->tensor());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Rsqrt.cpp b/compiler/luci-interpreter/src/loader/nodes/Rsqrt.cpp
index ff87f435c..0714a5dba 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Rsqrt.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Rsqrt.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleRsqrt(const luci::CircleNode *circle_node,
                                                  KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleRsqrt *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleRsqrt *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/SVDF.cpp b/compiler/luci-interpreter/src/loader/nodes/SVDF.cpp
index 89528d5ee..d172ef438 100644
--- a/compiler/luci-interpreter/src/loader/nodes/SVDF.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/SVDF.cpp
@@ -24,9 +24,8 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleSVDF(const luci::CircleNode *circle_node,
                                                 KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleSVDF *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleSVDF *>(circle_node);
+  assert(node->arity() == 5);
 
   const Tensor *input = helper.getInputTensor(node->input());
   const Tensor *feature = helper.getInputTensor(node->weight_feature());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Shape.cpp b/compiler/luci-interpreter/src/loader/nodes/Shape.cpp
new file mode 100644
index 000000000..d1edbc794
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Shape.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Shape.h"
+
+namespace luci_interpreter
+{
+// Kernel builder for CircleShape: binds the node's input/output tensors to a ShapeKernel.
+std::unique_ptr<Kernel> build_kernel_CircleShape(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleShape *>(circle_node);
+  assert(node->arity() == 1); // a Shape node is expected to have exactly one operand
+
+  const auto input = helper.getInputTensor(node->input());
+  auto output = helper.getOutputTensor(node);
+
+  ShapeParams shape_params{};
+  shape_params.out_type = node->out_type(); // forward the output type stored on the node
+
+  return std::make_unique<kernels::ShapeKernel>(input, output, shape_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Slice.cpp b/compiler/luci-interpreter/src/loader/nodes/Slice.cpp
index 741cd0806..60ac6417c 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Slice.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Slice.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleSlice(const luci::CircleNode *circle_node,
                                                  KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleSlice *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleSlice *>(circle_node);
   assert(node->arity() == 3);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Softmax.cpp b/compiler/luci-interpreter/src/loader/nodes/Softmax.cpp
index b15e4b6f3..f41f63f6f 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Softmax.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Softmax.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleSoftmax(const luci::CircleNode *circle_node,
                                                    KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleSoftmax *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleSoftmax *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->logits());
diff --git a/compiler/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp b/compiler/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp
index 91c237aa5..b6e6cf516 100644
--- a/compiler/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleSpaceToBatchND(const luci::CircleNode *circle_node,
                                                           KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleSpaceToBatchND *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleSpaceToBatchND *>(circle_node);
   assert(node->arity() == 3);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp b/compiler/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp
index 3cbbd9718..63fdb95ec 100644
--- a/compiler/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleSpaceToDepth(const luci::CircleNode *circle_node,
                                                         KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleSpaceToDepth *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleSpaceToDepth *>(circle_node);
   assert(node->arity() == 1);
   const Tensor *input = helper.getInputTensor(node->input());
 
diff --git a/compiler/luci-interpreter/src/loader/nodes/Split.cpp b/compiler/luci-interpreter/src/loader/nodes/Split.cpp
index 32553ad5e..3f6d4a7df 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Split.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Split.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleSplit(const luci::CircleNode *circle_node,
                                                  KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleSplit *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleSplit *>(circle_node);
   auto output_nodes = collectOutputNodes<luci::CircleSplitOut>(node);
   assert(node->arity() == 2);
   assert(output_nodes.size() == static_cast<size_t>(node->num_split()));
diff --git a/compiler/luci-interpreter/src/loader/nodes/SplitV.cpp b/compiler/luci-interpreter/src/loader/nodes/SplitV.cpp
index d78816447..0788822ca 100644
--- a/compiler/luci-interpreter/src/loader/nodes/SplitV.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/SplitV.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleSplitV(const luci::CircleNode *circle_node,
                                                   KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleSplitV *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleSplitV *>(circle_node);
   auto output_nodes = collectOutputNodes<luci::CircleSplitVOut>(node);
   assert(node->arity() == 3);
   assert(output_nodes.size() == static_cast<size_t>(node->num_split()));
diff --git a/compiler/luci-interpreter/src/loader/nodes/Sqrt.cpp b/compiler/luci-interpreter/src/loader/nodes/Sqrt.cpp
index 56dd986f1..b9843fe0b 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Sqrt.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Sqrt.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleSqrt(const luci::CircleNode *circle_node,
                                                 KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleSqrt *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleSqrt *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Square.cpp b/compiler/luci-interpreter/src/loader/nodes/Square.cpp
index 43aadb969..0ad7c1772 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Square.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Square.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleSquare(const luci::CircleNode *circle_node,
                                                   KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleSquare *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleSquare *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/SquaredDifference.cpp b/compiler/luci-interpreter/src/loader/nodes/SquaredDifference.cpp
index 6a2717aa2..e4c6fd851 100644
--- a/compiler/luci-interpreter/src/loader/nodes/SquaredDifference.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/SquaredDifference.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleSquaredDifference(const luci::CircleNode *circle_node,
                                                              KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleSquaredDifference *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleSquaredDifference *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Squeeze.cpp b/compiler/luci-interpreter/src/loader/nodes/Squeeze.cpp
index 583ff9314..6885f8077 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Squeeze.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Squeeze.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleSqueeze(const luci::CircleNode *circle_node,
                                                    KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleSqueeze *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleSqueeze *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/StridedSlice.cpp b/compiler/luci-interpreter/src/loader/nodes/StridedSlice.cpp
index fe5fa7707..359b4e3e9 100644
--- a/compiler/luci-interpreter/src/loader/nodes/StridedSlice.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/StridedSlice.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleStridedSlice(const luci::CircleNode *circle_node,
                                                         KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleStridedSlice *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleStridedSlice *>(circle_node);
   assert(node->arity() == 4);
 
   const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Sub.cpp b/compiler/luci-interpreter/src/loader/nodes/Sub.cpp
index bad4fbb13..a6252cb53 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Sub.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Sub.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleSub(const luci::CircleNode *circle_node,
                                                KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleSub *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleSub *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Tanh.cpp b/compiler/luci-interpreter/src/loader/nodes/Tanh.cpp
index f4255291b..a58ef60a8 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Tanh.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Tanh.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleTanh(const luci::CircleNode *circle_node,
                                                 KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleTanh *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleTanh *>(circle_node);
   assert(node->arity() == 1);
 
   const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Transpose.cpp b/compiler/luci-interpreter/src/loader/nodes/Transpose.cpp
index 4e095fbbc..ea17d8311 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Transpose.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Transpose.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleTranspose(const luci::CircleNode *circle_node,
                                                      KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleTranspose *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleTranspose *>(circle_node);
   assert(node->arity() == 2);
 
   const Tensor *input = helper.getInputTensor(node->a());
diff --git a/compiler/luci-interpreter/src/loader/nodes/TransposeConv.cpp b/compiler/luci-interpreter/src/loader/nodes/TransposeConv.cpp
index 1b954c35c..d773e301e 100644
--- a/compiler/luci-interpreter/src/loader/nodes/TransposeConv.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/TransposeConv.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleTransposeConv(const luci::CircleNode *circle_node,
                                                          KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleTransposeConv *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleTransposeConv *>(circle_node);
   assert(node->arity() == 4);
 
   const Tensor *input_sizes = helper.getInputTensor(node->inputSizes());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Unpack.cpp b/compiler/luci-interpreter/src/loader/nodes/Unpack.cpp
index 978c738c6..a1c0d323a 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Unpack.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Unpack.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleUnpack(const luci::CircleNode *circle_node,
                                                   KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleUnpack *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleUnpack *>(circle_node);
   auto output_nodes = collectOutputNodes<luci::CircleUnpackOut>(node);
   assert(node->arity() == 1);
   assert(output_nodes.size() == static_cast<size_t>(node->num()));
diff --git a/compiler/luci-interpreter/src/loader/nodes/While.cpp b/compiler/luci-interpreter/src/loader/nodes/While.cpp
index 284dc0c68..8fde6ec8a 100644
--- a/compiler/luci-interpreter/src/loader/nodes/While.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/While.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
 std::unique_ptr<Kernel> build_kernel_CircleWhile(const luci::CircleNode *circle_node,
                                                  KernelBuilderHelper &helper)
 {
-  const auto *node = dynamic_cast<const luci::CircleWhile *>(circle_node);
-  if (node == nullptr)
-    throw std::runtime_error("wrong builder for operation");
+  const auto *node = loco::must_cast<const luci::CircleWhile *>(circle_node);
 
   auto output_nodes = collectOutputNodes<luci::CircleWhileOut>(node);
   assert(node->arity() == node->input_count());
diff --git a/compiler/luci-micro/CMakeLists.txt b/compiler/luci-micro/CMakeLists.txt
index c8a2e12e1..642cf14a3 100644
--- a/compiler/luci-micro/CMakeLists.txt
+++ b/compiler/luci-micro/CMakeLists.txt
@@ -15,7 +15,7 @@ set(CMAKE_ARM_OPTIONS
   -DLUCI_STATIC=ON
   -DBUILD_CMSIS_NN_FUNCTIONS=ON
   -DTARGET_CPU=cortex-m7
-  "-DCMAKE_TOOLCHAIN_FILE=${NNAS_PROJECT_SOURCE_DIR}/infra/nncc/cmake/buildtool/config/arm-non-eabi-gcc.cmake"
+  "-DCMAKE_TOOLCHAIN_FILE=${NNAS_PROJECT_SOURCE_DIR}/infra/nncc/cmake/buildtool/config/arm-none-eabi-gcc.cmake"
   "-DLUCI_INTERPRETER_PAL_DIR=${CMAKE_CURRENT_SOURCE_DIR}/../luci-interpreter/pal/mcu"
   "-DNNAS_PROJECT_SOURCE_DIR=${NNAS_PROJECT_SOURCE_DIR}"
   "-DNNAS_EXTERNALS_DIR=${NNAS_EXTERNALS_DIR}"
diff --git a/compiler/luci-micro/luci-interpreter/CMakeLists.txt b/compiler/luci-micro/luci-interpreter/CMakeLists.txt
new file mode 100644
index 000000000..1f7acee87
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/CMakeLists.txt
@@ -0,0 +1,15 @@
+set(LUCI_INTERPRETER_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include")
+set(LUCI_INTERPRETER_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
+if (NOT LUCI_INTERPRETER_PAL_DIR) # fall back to the linux PAL when none was provided
+    set(LUCI_INTERPRETER_PAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}/pal/linux")
+endif()
+# Path of the kernel list file inside the selected PAL directory.
+set(KERNEL_REGISTER_FILE ${LUCI_INTERPRETER_PAL_DIR}/KernelsToBuild.lst)
+# The suffix is empty unless the caller defines CUSTOM_LUCI_INTERPRETER_SUFFIX.
+if (NOT DEFINED CUSTOM_LUCI_INTERPRETER_SUFFIX)
+    set(LUCI_INTERPRETER_SUFFIX "")
+else()
+    set(LUCI_INTERPRETER_SUFFIX ${CUSTOM_LUCI_INTERPRETER_SUFFIX})
+endif()
+
+add_subdirectory(src)
diff --git a/compiler/luci-micro/luci-interpreter/README.md b/compiler/luci-micro/luci-interpreter/README.md
new file mode 100644
index 000000000..77ec5c81c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/README.md
@@ -0,0 +1,158 @@
+# luci-interpreter
+
+`luci-interpreter` is an inference engine for neural networks represented in luci IR.
+See `compiler/luci/lang` directory for details about IR.
+You can find useful infrastructure, like importer/exporter, optimizations in `compiler/luci`.
+
+`luci-interpreter` provides:
+- Basic inference functionality, input setters and output getters
+- Interface for inspecting hidden interpreter state, like activation values during inference
+- Customization mechanisms to fit the interpreter to specific platforms, like MCUs
+
+Public interface headers are placed in `luci-interpreter/include/luci_interpreter` directory
+
+## Basic usage
+
+Minimal usage includes:
+- Setting input data
+- Running inference
+- Fetching inference results
+
+Interpreter object is reusable and can run multiple inferences.
+Elements in tensors (input/output/internal) are stored contiguously and have C-like layout:
+This means for tensor t=[[0, 1],[2, 3]], t[0,1] == 1.
+
+Input and output tensors have the same indexes as in original luci model. 
+
+**Usage example:**
+``` c++
+// Note getTensorSize is a function that computes tensor size,
+// it is not part of interpreter and should be implemented by user 
+
+luci_interpreter::Interpreter interpreter(luci_module);
+
+// Set inputs
+// assuming model has only one input and one output
+const auto input_nodes = loco::input_nodes(module->graph());
+
+const auto *input_node = dynamic_cast<const luci::CircleInput *>(input_nodes[0]);
+std::vector<char> input_data(getTensorSize(input_node));
+// Initialize input data here
+
+interpreter.writeInputTensor(input_node, input_data.data(), input_data.size());
+
+// Start inference
+interpreter.interpret();
+
+// Fetch inference results
+const auto output_nodes = loco::output_nodes(module->graph());
+const auto *output_node = dynamic_cast<const luci::CircleOutput *>(output_nodes[0]);
+std::vector<char> output_data(getTensorSize(output_node));
+interpreter.readOutputTensor(output_node, output_data.data(), output_data.size());
+```
+
+## Inspecting intermediate state
+
+Interpreter provides interfaces to investigate internal state of interpreter during inference.
+
+This is done by "observer" mechanism:
+- `Interpreter` class has `attachObserver` method, which takes pointer to `ExecutionObserver` object
+- `ExecutionObserver` defines several callback methods user can override to inject custom code
+
+ExecutionObserver provides three callbacks:
+- `postTensorWrite` checks contents of output tensor after operation execution
+- `preOperatorExecute` notifies that interpreter is going to execute operation
+- `postOperatorExecute` notifies that interpreter has finished execution of an operation
+
+See `luci-interpreter/include/luci_interpreter/Interpreter.h` for this interface details.
+
+**Usage example:**
+``` c++
+class CustomExecutionObserver: public luci_interpreter::ExecutionObserver
+{
+public:
+  void postTensorWrite(const luci::CircleNode *node, const Tensor *tensor) override
+  {
+    if (tensor->element_type() != loco::DataType::FLOAT32)
+      return;
+    for (int i = 0; i < tensor->shape().num_elements(); ++i)
+      std::cout << tensor->data<float>()[i] << ", ";
+  }
+
+  // User observer can override only needed methods,
+  // others will inherit empty implementation from base observer.
+
+  // void preOperatorExecute(const luci::CircleNode *node);
+  // void postOperatorExecute(const luci::CircleNode *node);
+};
+
+luci_interpreter::Interpreter interpreter(module);
+CustomExecutionObserver observer;
+interpreter.attachObserver(&observer);
+
+// initialize input_data
+interpreter.writeInputTensor(input_node, input_data.data(), input_data.size());
+
+interpreter.interpret();
+```
+
+## Customizing inference
+
+### Memory manager
+
+Interpreter provides a handle for altering default memory management mechanisms.
+
+This is done through the `MemoryManager` interface; see `luci-interpreter/include/luci_interpreter/MemoryManager.h` for implementation details.
+
+This header contains the `IMemoryManager` abstract class, which is responsible for allocation and deallocation of tensors' memory.
+
+User can construct an interpreter with one of predefined memory managers or their own custom memory manager.
+Note that one memory manager could be shared between multiple interpreter instances, because an interpreter does not own the manager object. 
+
+List of predefined memory managers:
+- `SimpleMemoryManager` This is a simple wrapper around new/delete, default one.
+- `TestMemoryManager` Memorizes all allocated memory and releases it in Manager destructor, used in kernel unit tests.
+- `BuddyMemoryManager` Implements Buddy algorithm, uses external buffer for tensor data allocations, does not need new/delete.
+- `StaticMemoryManager` Uses precomputed memory allocation plan. Requires preparation with MemoryPlanner, but could reduce memory consumption in restricted environments (like MCUs).
+
+**SimpleMemoryManager usage example:**
+
+This is the default memory manager, so nothing needs to be selected to use it.
+``` c++
+luci_interpreter::Interpreter interpreter(module);
+```
+
+**TestMemoryManager usage example:**
+
+``` c++
+luci_interpreter::TestMemoryManager mm;
+luci_interpreter::Interpreter interpreter(module, &mm);
+```
+
+**BuddyMemoryManager usage example:**
+
+`BuddyMemoryManager` implements a classic allocation algorithm: https://en.wikipedia.org/wiki/Buddy_memory_allocation.
+
+This allocator uses an external buffer as a memory pool, which allows static memory arrays to be used for allocations.
+
+Limitations
+- Current implementation uses only lower power-of-two bytes of given buffer.
+
+  For example for 1000 bytes buffer, only lower 512 bytes will be used.
+- Current implementation can handle a memory pool of at most 4 gigabytes
+
+``` c++
+  constexpr int buffer_size = 2048;
+  static uint8_t buffer[buffer_size];
+  luci_interpreter::BuddyMemoryManager memory_manager(buffer, buffer_size);
+  luci_interpreter::Interpreter interpreter(module.get(), &memory_manager);
+```
+
+**StaticMemoryManager usage example:**
+``` c++
+TBD when it is merged
+```
+
+## Further reading
+
+If you want to participate in development, please read `DEVELOPER.md` for SW architecture details.
diff --git a/compiler/luci-micro/luci-interpreter/include/luci_interpreter/BuddyMemoryManager.h b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/BuddyMemoryManager.h
new file mode 100644
index 000000000..205baa626
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/BuddyMemoryManager.h
@@ -0,0 +1,144 @@
+/* Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci_interpreter/MemoryManager.h"
+
+#ifndef LUCI_INTERPRETER_BUDDY_MEMORY_MANAGER_H
+#define LUCI_INTERPRETER_BUDDY_MEMORY_MANAGER_H
+
+namespace luci_interpreter
+{
+// IMemoryManager implementation built on buddy-style splitting and merging of blocks.
+class BuddyMemoryManager : public IMemoryManager
+{
+public:
+  BuddyMemoryManager(uint8_t *memory_start, int32_t memSize);
+
+  void allocate_memory(luci_interpreter::Tensor &tensor) final;
+  void release_memory(luci_interpreter::Tensor &tensor) final;
+
+private:
+  struct Block // header placed in front of each block's payload (layout: see divideBlock)
+  {
+    Block *next_free; // next entry in the same free list (see addToBlocks)
+    bool is_free;
+    uint32_t size; // bytes following the header; sizeof(Block) is not included
+    // debug field: set to the block's own address
+    Block *self;
+  };
+
+  Block *_start_block; // base address for the pool offsets computed in mergeBlock
+  int32_t _num_blocks;
+  uint32_t _size;
+  Block *_free_blocks[32]{}; // free-list heads, indexed by the level `l` used below
+  // Returns floor(log2(val)); both 0 and 1 map to 0.
+  static int32_t lowerLog2(uint32_t val)
+  {
+    int32_t i = 0;
+    while (val >>= 1)
+      i++;
+
+    return i;
+  }
+  // Pushes block onto the head of free list l; a null block is ignored.
+  void addToBlocks(Block *block, int32_t l)
+  {
+    if (!block)
+      return;
+
+    block->next_free = _free_blocks[l];
+    _free_blocks[l] = block;
+  }
+  // Unlinks block from free list l; no-op when block is null or not found in that list.
+  void removeFromBlocks(const Block *block, int32_t l)
+  {
+    if (!block)
+      return;
+
+    Block *tmp = _free_blocks[l];
+
+    if (block == tmp)
+    {
+      _free_blocks[l] = block->next_free;
+      return;
+    }
+
+    while (tmp)
+    {
+      if (tmp->next_free == block)
+      {
+        tmp->next_free = block->next_free;
+        return;
+      }
+
+      tmp = tmp->next_free;
+    }
+  }
+  // Halves block: the lower half stays unlisted and the upper half joins free list l - 1.
+  void divideBlock(Block *block, int32_t l)
+  {
+    int32_t size = ((block->size + sizeof(Block)) / 2) - sizeof(Block);
+
+    removeFromBlocks(block, l);
+
+    // there is no need to add to the free_blocks list here
+    block->is_free = true;
+    block->size = size;
+    block->self = block;
+
+    Block *buddy;
+    buddy = (Block *)((uint8_t *)block + sizeof(Block) + size);
+    buddy->is_free = true;
+    buddy->size = size;
+    buddy->self = buddy;
+
+    addToBlocks(buddy, l - 1);
+  }
+  // Joins block with its free, equal-sized buddy into list l + 1; returns nullptr otherwise.
+  Block *mergeBlock(Block *block)
+  {
+    Block *buddy;
+
+    const int32_t l = lowerLog2(block->size + sizeof(Block));
+    // The buddy lives at this block's pool offset with bit l flipped.
+    const int64_t address = ((uint8_t *)block - (uint8_t *)_start_block);
+    buddy = (Block *)((address ^ (1 << l)) + (uint8_t *)_start_block);
+    // NOTE(review): 1 << l is an int shift and overflows for l >= 31 - confirm l stays below.
+    if (!buddy->is_free || buddy->size != block->size)
+      return nullptr;
+    // Make block the lower-addressed half so it becomes the merged block's header.
+    if (block > buddy)
+    {
+      Block *x = block;
+      block = buddy;
+      buddy = x;
+    }
+
+    removeFromBlocks(block, l);
+    removeFromBlocks(buddy, l);
+
+    block->size = block->size * 2 + sizeof(Block);
+    block->is_free = true;
+    block->self = block;
+
+    addToBlocks(block, l + 1);
+
+    return block;
+  }
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_BUDDY_MEMORY_MANAGER_H
diff --git a/compiler/luci-micro/luci-interpreter/include/luci_interpreter/GraphBuilderRegistry.h b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/GraphBuilderRegistry.h
new file mode 100644
index 000000000..375b1ae20
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/GraphBuilderRegistry.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_INTERPRETER_GRAPH_BUILDER_REGISTRY__
+#define __LUCI_INTERPRETER_GRAPH_BUILDER_REGISTRY__
+
+#include <luci/Import/GraphBuilderRegistry.h>
+
+namespace luci_interpreter
+{
+
+/**
+ * @brief Creates and returns a GraphBuilderSource that avoids copying constant buffers from the
+ * model's file.
+ *
+ * @warning Use this source only when the model's buffer outlives the Interpreter.
+ */
+std::unique_ptr<luci::GraphBuilderSource> source_without_constant_copying();
+
+} // namespace luci_interpreter
+
+#endif // __LUCI_INTERPRETER_GRAPH_BUILDER_REGISTRY__
diff --git a/compiler/luci-micro/luci-interpreter/include/luci_interpreter/Interpreter.h b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/Interpreter.h
new file mode 100644
index 000000000..8e2f457a5
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/Interpreter.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_INTERPRETER_H
+#define LUCI_INTERPRETER_INTERPRETER_H
+
+#include "luci_interpreter/core/Tensor.h"
+
+#include <luci/IR/Nodes/CircleInput.h>
+#include <luci/IR/Nodes/CircleOutput.h>
+
+#include "luci_interpreter/MemoryManager.h"
+#include <luci/IR/Module.h>
+
+#include <memory>
+#include <vector>
+#include <unordered_map>
+
+namespace luci_interpreter
+{
+// Observer interface for execution events; register one with Interpreter::attachObserver().
+class ExecutionObserver
+{
+public:
+  virtual ~ExecutionObserver();
+  // The hooks below are virtual but not pure; their definitions are outside this header.
+  // Called when the value of a tensor has been updated during execution.
+  virtual void postTensorWrite(const luci::CircleNode *node, const Tensor *tensor);
+
+  // Called before / after executing an operator.
+  // Note that these methods are not called for auxiliary operators (CircleInput, CircleOutput,
+  // CircleConst and Circle*Out).
+  virtual void preOperatorExecute(const luci::CircleNode *node);
+  virtual void postOperatorExecute(const luci::CircleNode *node);
+};
+// Front-end object constructed from a luci::Module; member functions are defined elsewhere.
+class Interpreter
+{
+public:
+  explicit Interpreter(const luci::Module *module);
+  // Overload that additionally takes a caller-supplied memory manager (raw pointer).
+  explicit Interpreter(const luci::Module *module, IMemoryManager *memory_manager);
+
+  ~Interpreter();
+  // NOTE(review): data_size is presumably the byte size of the `data` buffer - confirm in .cpp.
+  void writeInputTensor(const luci::CircleInput *input_node, const void *data, size_t data_size);
+
+  void readOutputTensor(const luci::CircleOutput *output_node, void *data, size_t data_size);
+
+  void interpret();
+  // Observers are passed as raw pointers; presumably not owned by the interpreter - confirm.
+  void attachObserver(ExecutionObserver *observer);
+  // NOTE(review): operator[] inserts a nullptr entry for an unknown node; find() would not.
+  const Tensor *getTensor(const loco::Node *node) { return _node_to_tensor[node]; }
+
+private:
+  // _default_memory_manager must be declared before _runtime_module: members are destroyed in
+  // reverse declaration order, so _runtime_module is destroyed while the manager still exists.
+  std::unique_ptr<IMemoryManager> _default_memory_manager = nullptr;
+  std::unique_ptr<class RuntimeModule> _runtime_module;
+
+  // Observer functionality support.
+  std::unique_ptr<struct RuntimeToIR> _runtime_to_ir;
+  std::unordered_map<const loco::Node *, Tensor *> _node_to_tensor;
+  std::unique_ptr<class EventNotifier> _event_notifier;
+  std::vector<ExecutionObserver *> _observers;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_INTERPRETER_H
diff --git a/compiler/luci-micro/luci-interpreter/include/luci_interpreter/MemoryManager.h b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/MemoryManager.h
new file mode 100644
index 000000000..f32c52095
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/MemoryManager.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_MEMORY_MANAGER_H
+#define LUCI_INTERPRETER_MEMORY_MANAGER_H
+
+#include "luci_interpreter/core/DataType.h"
+#include "luci_interpreter/core/Tensor.h"
+
+namespace luci_interpreter
+{
+// Abstract memory-manager interface; concrete managers override both pure-virtual hooks.
+class IMemoryManager
+{
+public:
+  virtual void allocate_memory(luci_interpreter::Tensor &tensor) = 0;
+  virtual void release_memory(luci_interpreter::Tensor &tensor) = 0;
+  // Virtual destructor so a manager can be destroyed through an IMemoryManager pointer.
+  virtual ~IMemoryManager() = default;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_MEMORY_MANAGER_H
diff --git a/compiler/luci-micro/luci-interpreter/include/luci_interpreter/SimpleMemoryManager.h b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/SimpleMemoryManager.h
new file mode 100644
index 000000000..658a1c609
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/SimpleMemoryManager.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_SIMPLE_MEMORY_MANAGER_H
+#define LUCI_INTERPRETER_SIMPLE_MEMORY_MANAGER_H
+
+#include "luci_interpreter/MemoryManager.h"
+
+namespace luci_interpreter
+{
+// IMemoryManager implementation; both overrides are final and defined outside this header.
+class SimpleMemoryManager : public IMemoryManager
+{
+public:
+  void allocate_memory(luci_interpreter::Tensor &tensor) final;
+  void release_memory(luci_interpreter::Tensor &tensor) final;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_SIMPLE_MEMORY_MANAGER_H
diff --git a/compiler/luci-micro/luci-interpreter/include/luci_interpreter/StaticMemoryManager.h b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/StaticMemoryManager.h
new file mode 100644
index 000000000..ded7bde79
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/StaticMemoryManager.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_STATIC_MEMORY_MANAGER_H
+#define LUCI_INTERPRETER_STATIC_MEMORY_MANAGER_H
+
+#include "luci_interpreter/MemoryManager.h"
+
+namespace luci_interpreter
+{
+
+// Used for allocations in a static buffer, using offsets defined in the luci model.
+class StaticMemoryManager : public IMemoryManager
+{
+public:
+  StaticMemoryManager() = delete;
+  // Only stores the pointer; no destructor is declared here, so the buffer is not freed.
+  explicit StaticMemoryManager(uint8_t *buffer_ptr) : _buffer_ptr(buffer_ptr)
+  { /* Do nothing */
+  }
+
+  void allocate_memory(luci_interpreter::Tensor &tensor) final;
+  void release_memory(luci_interpreter::Tensor &tensor) final;
+
+private:
+  // Stores a pointer to the beginning of the allocated memory buffer.
+  uint8_t *_buffer_ptr;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_STATIC_MEMORY_MANAGER_H
diff --git a/compiler/luci-micro/luci-interpreter/include/luci_interpreter/TestMemoryManager.h b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/TestMemoryManager.h
new file mode 100644
index 000000000..397bbed76
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/TestMemoryManager.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_TEST_MEMORY_MANAGER_H
+#define LUCI_INTERPRETER_TEST_MEMORY_MANAGER_H
+
+#include "luci_interpreter/MemoryManager.h"
+
+namespace luci_interpreter
+{
+// Memory manager for use in kernel tests. It removes the need to manually delete the memory
+// allocated in tests: the manager remembers all of its allocations and deletes them in its
+// destructor.
+class TestMemoryManager : public IMemoryManager
+{
+public:
+  void allocate_memory(luci_interpreter::Tensor &tensor) final;
+  void release_memory(luci_interpreter::Tensor &tensor) final;
+  // NOTE(review): copy operations are not deleted; a copy would delete[] the same pointers twice.
+  ~TestMemoryManager() override
+  {
+    for (auto allocation : allocations)
+    {
+      delete[] allocation;
+    }
+  }
+
+private:
+  std::vector<uint8_t *> allocations; // every entry is released with delete[] in the destructor
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_TEST_MEMORY_MANAGER_H
diff --git a/compiler/circledump/include/circleread/Model.h b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/core/DataType.h
index 234db8b4c..27bf719b5 100644
--- a/compiler/circledump/include/circleread/Model.h
+++ b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/core/DataType.h
@@ -14,30 +14,23 @@
  * limitations under the License.
  */
 
-#ifndef __CIRCLEREAD_MODEL_H__
-#define __CIRCLEREAD_MODEL_H__
+#ifndef LUCI_INTERPRETER_CORE_DATATYPE_H
+#define LUCI_INTERPRETER_CORE_DATATYPE_H
 
-#include <mio/circle/schema_generated.h>
+#include <loco/IR/DataType.h>
+#include <loco/IR/DataTypeTraits.h>
 
-#include <memory>
+#include <cstddef>
 
-namespace circleread
+namespace luci_interpreter
 {
 
-struct Model
-{
-  virtual ~Model() = default;
+using DataType = loco::DataType; // alias: the interpreter uses loco's data type directly
 
-  virtual const ::circle::Model *model(void) const = 0;
-};
+template <DataType DT> using DataTypeImpl = loco::DataTypeImpl<DT>; // alias template
 
-/**
- * @brief Load Circle model (as a raw Model) from a given path
- *
- * @note May return a nullptr
- */
-std::unique_ptr<Model> load_circle(const std::string &path);
+inline size_t getDataTypeSize(DataType data_type) { return loco::size(data_type); } // thin wrapper
 
-} // namespace circleread
+} // namespace luci_interpreter
 
-#endif // __CIRCLEREAD_MODEL_H__
+#endif // LUCI_INTERPRETER_CORE_DATATYPE_H
diff --git a/compiler/luci-micro/luci-interpreter/include/luci_interpreter/core/Tensor.h b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/core/Tensor.h
new file mode 100644
index 000000000..bb9ff6d4a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/include/luci_interpreter/core/Tensor.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_CORE_TENSOR_H
+#define LUCI_INTERPRETER_CORE_TENSOR_H
+
+#include "luci_interpreter/core/DataType.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace luci_interpreter
+{
+// Tensor shape: an ordered list of int32_t dimension sizes.
+class Shape
+{
+public:
+  explicit Shape(int rank) : _dims(rank, 0) {} // `rank` dimensions, each initialized to 0
+  // Note: Shape{3} selects this list constructor (one dimension of size 3), unlike Shape(3).
+  Shape(std::initializer_list<int32_t> dims) : _dims(dims.begin(), dims.end()) {}
+  // Number of dimensions (rank).
+  int num_dims() const { return _dims.size(); }
+  // Size of dimension i; the index is range-checked by assert only (no check under NDEBUG).
+  int32_t dim(int i) const
+  {
+    assert(i >= 0 && i < static_cast<int>(_dims.size()));
+    return _dims[i];
+  }
+  // Mutable access to dimension i, with the same assert-only range check.
+  int32_t &dim(int i)
+  {
+    assert(i >= 0 && i < static_cast<int>(_dims.size()));
+    return _dims[i];
+  }
+  // Product of all dimension sizes; 1 for a rank-0 shape. No overflow check is performed.
+  int32_t num_elements() const
+  {
+    int32_t result = 1;
+    for (const int32_t dim : _dims)
+    {
+      result *= dim;
+    }
+    return result;
+  }
+  // Two shapes are equal when their dimension vectors compare equal.
+  bool operator==(const Shape &other) const { return _dims == other._dims; }
+
+  bool operator!=(const Shape &other) const { return !operator==(other); }
+
+private:
+  std::vector<int32_t> _dims;
+};
+
+// Tensor affine quantization parameters.
+//
+// The relationship between real and quantized values:
+//   real_value = (quantized_value - zero_point) * scale
+//
+// In the per-tensor case, 'scale' and 'zero_point' have one element each.
+// In the per-channel case, 'scale' and 'zero_point' have N elements each, where N is the size
+// of the quantized dimension.
+//
+// Note that due to historical and performance reasons, per-tensor quantization uses unsigned
+// integer types, while per-channel uses signed types assuming 'zero_point' == 0.
+struct AffineQuantization
+{
+  std::vector<float> scale;        // one element (per-tensor) or N elements (per-channel)
+  std::vector<int32_t> zero_point; // one element (per-tensor) or N elements (per-channel)
+  int32_t quantized_dimension;     // no default member initializer
+};
+// Tensor metadata (element type, shape, quantization, name) plus a raw pointer to its data.
+class Tensor
+{
+public:
+  Tensor(DataType element_type, Shape shape, AffineQuantization quantization, std::string name);
+
+  DataType element_type() const { return _element_type; }
+
+  const Shape &shape() const { return _shape; }
+  // Per-tensor scale; asserts that exactly one scale value is stored.
+  float scale() const
+  {
+    assert(_quantization.scale.size() == 1);
+    return _quantization.scale[0];
+  }
+  // Per-tensor zero point; asserts that exactly one zero-point value is stored.
+  int32_t zero_point() const
+  {
+    assert(_quantization.zero_point.size() == 1);
+    return _quantization.zero_point[0];
+  }
+  // Full scale / zero-point vectors, without any assertion on their element count.
+  const std::vector<float> &scales() const { return _quantization.scale; }
+
+  const std::vector<int32_t> &zero_points() const { return _quantization.zero_point; }
+
+  int32_t quantized_dimension() const { return _quantization.quantized_dimension; }
+  // Reinterprets the raw buffer as T; the static_assert requires uint8_t to be a char type.
+  template <typename T> const T *data() const
+  {
+    static_assert(std::is_same<uint8_t, char>::value or
+                  std::is_same<uint8_t, unsigned char>::value);
+    return reinterpret_cast<const T *>(_data);
+  }
+  // NOTE(review): std::is_same is used but <type_traits> is not included directly here.
+  template <typename T> T *data()
+  {
+    static_assert(std::is_same<uint8_t, char>::value or
+                  std::is_same<uint8_t, unsigned char>::value);
+    return reinterpret_cast<T *>(_data);
+  }
+
+  const std::string &name() const { return _name; }
+  // readData / writeData / resize are only declared here; they are defined outside this header.
+  void readData(void *data_ptr, size_t data_size) const;
+
+  void writeData(const void *data_ptr, size_t data_size);
+
+  void resize(const Shape &new_shape);
+  // Installs the raw buffer pointer; nullptr marks the tensor as having no data allocated.
+  void set_data_buffer(uint8_t *buffer)
+  {
+    if (buffer == nullptr)
+    {
+      _data_allocated = false;
+    }
+    else
+    {
+      _data_allocated = true;
+    }
+    _data = buffer; // the previous pointer is overwritten, not freed
+  }
+
+  bool is_observable() const { return _is_observable; }
+
+  void set_observable(bool value) { _is_observable = value; }
+
+  bool is_allocatable() const { return _is_allocatable; }
+
+  void set_allocatable(bool value) { _is_allocatable = value; }
+
+  bool is_data_allocated() const { return _data_allocated; }
+
+  int32_t get_offset() const { return _offset; }
+
+  void set_offset(int32_t offset) { _offset = offset; }
+
+private:
+  DataType _element_type;
+  Shape _shape;
+  AffineQuantization _quantization;
+  uint8_t *_data; // no in-class initializer; presumably set by the constructor - confirm
+  std::string _name;
+  bool _data_allocated; // no in-class initializer; presumably set by the constructor - confirm
+  // A write to the tensor is reported to registered Observers only if the tensor is observable.
+  // This is needed for tensors used in kernel implementation, but not present in original model.
+  bool _is_observable = true;
+  // Memory manager is called for tensor only if it is "allocatable".
+  // Kernel configuration could disable allocation of some tensors if they are not needed for
+  // particular operation.
+  bool _is_allocatable = true;
+  // Used by static memory manager.
+  // Stores the offset from the beginning of the allocated memory buffer.
+  int32_t _offset = -1;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_CORE_TENSOR_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst
new file mode 100644
index 000000000..f0df58db3
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst
@@ -0,0 +1,62 @@
+REGISTER_KERNEL(Add)
+REGISTER_KERNEL(ArgMax)
+REGISTER_KERNEL(AveragePool2D)
+REGISTER_KERNEL(BatchToSpaceND)
+REGISTER_KERNEL(Cast)
+REGISTER_KERNEL(Concatenation)
+REGISTER_KERNEL(Conv2D)
+REGISTER_KERNEL(DepthToSpace)
+REGISTER_KERNEL(DepthwiseConv2D)
+REGISTER_KERNEL(Dequantize)
+REGISTER_KERNEL(Div)
+REGISTER_KERNEL(Elu)
+REGISTER_KERNEL(Exp)
+REGISTER_KERNEL(ExpandDims)
+REGISTER_KERNEL(Fill)
+REGISTER_KERNEL(Floor)
+REGISTER_KERNEL(FloorDiv)
+REGISTER_KERNEL(Equal)
+REGISTER_KERNEL(FullyConnected)
+REGISTER_KERNEL(Greater)
+REGISTER_KERNEL(GreaterEqual)
+REGISTER_KERNEL(If)
+REGISTER_KERNEL(InstanceNorm)
+REGISTER_KERNEL(L2Normalize)
+REGISTER_KERNEL(L2Pool2D)
+REGISTER_KERNEL(LeakyRelu)
+REGISTER_KERNEL(Less)
+REGISTER_KERNEL(LessEqual)
+REGISTER_KERNEL(LogicalAnd)
+REGISTER_KERNEL(LogicalNot)
+REGISTER_KERNEL(LogicalOr)
+REGISTER_KERNEL(Logistic)
+REGISTER_KERNEL(Maximum)
+REGISTER_KERNEL(MaxPool2D)
+REGISTER_KERNEL(Minimum)
+REGISTER_KERNEL(MirrorPad)
+REGISTER_KERNEL(Mul)
+REGISTER_KERNEL(Neg)
+REGISTER_KERNEL(NotEqual)
+REGISTER_KERNEL(Pad)
+REGISTER_KERNEL(PadV2)
+REGISTER_KERNEL(PRelu)
+REGISTER_KERNEL(Quantize)
+REGISTER_KERNEL(Reshape)
+REGISTER_KERNEL(ResizeBilinear)
+REGISTER_KERNEL(ResizeNearestNeighbor)
+REGISTER_KERNEL(Rsqrt)
+REGISTER_KERNEL(Shape)
+REGISTER_KERNEL(Softmax)
+REGISTER_KERNEL(SpaceToBatchND)
+REGISTER_KERNEL(SpaceToDepth)
+REGISTER_KERNEL(StridedSlice)
+REGISTER_KERNEL(Sqrt)
+REGISTER_KERNEL(Square)
+REGISTER_KERNEL(SquaredDifference)
+REGISTER_KERNEL(Squeeze)
+REGISTER_KERNEL(Sub)
+REGISTER_KERNEL(SVDF)
+REGISTER_KERNEL(Tanh)
+REGISTER_KERNEL(Transpose)
+REGISTER_KERNEL(TransposeConv)
+REGISTER_KERNEL(While)
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALArgMax.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALArgMax.h
new file mode 100644
index 000000000..21e63296d
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALArgMax.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_ARGMAX_H
+#define LUCI_INTERPRETER_PAL_ARGMAX_H
+
+#include <tensorflow/lite/kernels/internal/reference/arg_min_max.h>
+// PAL shim: ArgMinMax forwards its arguments unchanged to tflite::reference_ops::ArgMinMax.
+namespace luci_interpreter_pal
+{
+template <typename T1, typename T2, typename T3>
+static inline void ArgMinMax(const tflite::RuntimeShape &input1_shape, const T1 *input1_data,
+                             const T2 *axis, const tflite::RuntimeShape &output_shape,
+                             T3 *output_data, const std::greater<T1> cmp)
+{ // the comparator type is fixed to std::greater<T1> by this signature
+  tflite::reference_ops::ArgMinMax(input1_shape, input1_data, axis, output_shape, output_data, cmp);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_ARGMAX_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALAveragePool2d.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALAveragePool2d.h
new file mode 100644
index 000000000..a274afb7e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALAveragePool2d.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H
+#define LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h>
+#include <tensorflow/lite/kernels/internal/reference/pooling.h>
+#include <arm_nn_types.h>
+#include <arm_nnfunctions.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void AveragePool(const tflite::PoolParams &params,
+                               const tflite::RuntimeShape &input_shape, const T *input_data,
+                               const tflite::RuntimeShape &output_shape, T *output_data,
+                               const tflite::RuntimeShape &scratchpad_shape, T *scratchpad_data)
+{
+  {
+    // Generic fallback: not implemented; only the int8_t specialization does real work.
+    assert(false && "AveragePool NYI"); // compiled out under NDEBUG, leaving a no-op
+    (void)params;
+    (void)input_shape;
+    (void)input_data;
+    (void)output_shape;
+    (void)output_data;
+    (void)scratchpad_shape;
+    (void)scratchpad_data;
+  }
+}
+// int8_t specialization: runs CMSIS-NN arm_avgpool_s8 on 4-D shapes with a single batch.
+template <>
+inline void AveragePool<int8_t>(const tflite::PoolParams &params,
+                                const tflite::RuntimeShape &input_shape, const int8_t *input_data,
+                                const tflite::RuntimeShape &output_shape, int8_t *output_data,
+                                const tflite::RuntimeShape &scratchpad_shape,
+                                int8_t *scratchpad_data)
+{
+  assert(input_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  assert(scratchpad_data != nullptr);
+  // Only a single batch is handled: input_dims.n and output_dims.n are hard-coded to 1 below.
+  const int32_t batches = tflite::MatchingDim(input_shape, 0, output_shape, 0);
+  assert(batches == 1);
+
+  const int depth = tflite::MatchingDim(input_shape, 3, output_shape, 3);
+  // Shape dims 1 and 2 are used as height and width; the matched dim 3 is the channel count.
+  cmsis_nn_dims input_dims;
+  input_dims.n = 1;
+  input_dims.h = input_shape.Dims(1);
+  input_dims.w = input_shape.Dims(2);
+  input_dims.c = depth;
+
+  cmsis_nn_dims output_dims;
+  output_dims.n = 1;
+  output_dims.h = output_shape.Dims(1);
+  output_dims.w = output_shape.Dims(2);
+  output_dims.c = depth;
+  // Copy stride, padding and the quantized activation range from the TFLite params.
+  cmsis_nn_pool_params pool_params;
+  pool_params.stride.h = params.stride_height;
+  pool_params.stride.w = params.stride_width;
+  pool_params.padding.h = params.padding_values.height;
+  pool_params.padding.w = params.padding_values.width;
+  pool_params.activation.min = params.quantized_activation_min;
+  pool_params.activation.max = params.quantized_activation_max;
+  // Pooling window size comes from the params; n and c are set to 1.
+  cmsis_nn_dims filter_dims;
+  filter_dims.n = 1;
+  filter_dims.h = params.filter_height;
+  filter_dims.w = params.filter_width;
+  filter_dims.c = 1;
+  // The scratchpad is handed to CMSIS-NN as its context buffer; shape dim 0 is its size.
+  cmsis_nn_context ctx;
+  ctx.buf = scratchpad_data;
+  ctx.size = scratchpad_shape.Dims(0);
+  auto res = arm_avgpool_s8(&ctx, &pool_params, &input_dims, input_data, &filter_dims, &output_dims,
+                            output_data);
+  assert(res == ARM_MATH_SUCCESS); // NOTE(review): status is checked only when asserts are on
+}
+// Sizes the scratchpad for S8 input via arm_avgpool_s8_get_buffer_size; otherwise disables it.
+static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad,
+                                         const luci_interpreter::DataType &input_data_type,
+                                         const tflite::RuntimeShape &input_shape,
+                                         const tflite::RuntimeShape &output_shape)
+
+{
+  if (input_data_type == luci_interpreter::DataType::S8)
+  {
+    assert(input_shape.DimensionsCount() == 4);
+    assert(output_shape.DimensionsCount() == 4);
+    // The CMSIS-NN size query takes only the output width and the channel depth.
+    const int32_t output_width = output_shape.Dims(2);
+    const int32_t depth = tflite::MatchingDim(input_shape, 3, output_shape, 3);
+
+    const int32_t buf_size = arm_avgpool_s8_get_buffer_size(output_width, depth);
+    auto data_type_size = static_cast<int32_t>(luci_interpreter::getDataTypeSize(input_data_type));
+    // 1-D scratchpad shape: the queried buffer size multiplied by the input element size.
+    luci_interpreter::Shape scratchpad_shape{buf_size * data_type_size};
+    scratchpad->resize(scratchpad_shape);
+  }
+  else
+  {
+    scratchpad->set_allocatable(false); // marks the scratchpad as not to be allocated
+  }
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALBatchToSpaceND.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALBatchToSpaceND.h
new file mode 100644
index 000000000..4dd77ffdc
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALBatchToSpaceND.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
+#define LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
+
+#include <tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h>
+// PAL shim: BatchToSpaceND forwards unchanged to tflite::reference_ops::BatchToSpaceND.
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+BatchToSpaceND(const tflite::RuntimeShape &unextended_input1_shape, const T *input1_data,
+               const tflite::RuntimeShape &unextended_input2_shape, const int32 *block_shape_data,
+               const tflite::RuntimeShape &unextended_input3_shape, const int32 *crops_data,
+               const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{ // arguments are passed through in the same order, with no extra checks here
+  tflite::reference_ops::BatchToSpaceND(
+    unextended_input1_shape, input1_data, unextended_input2_shape, block_shape_data,
+    unextended_input3_shape, crops_data, unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALConv2d.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALConv2d.h
new file mode 100644
index 000000000..cfb84ea60
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALConv2d.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_CONV2D_H
+#define LUCI_INTERPRETER_PAL_CONV2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/conv.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/conv.h>
+#include <arm_nn_types.h>
+#include <arm_nnfunctions.h>
+
+namespace luci_interpreter_pal
+{
+static inline void Conv(const tflite::ConvParams &params, const tflite::RuntimeShape &input_shape,
+                        const float *input_data, const tflite::RuntimeShape &filter_shape,
+                        const float *filter_data, const tflite::RuntimeShape &bias_shape,
+                        const float *bias_data, const tflite::RuntimeShape &output_shape,
+                        float *output_data, const tflite::RuntimeShape &scratchpad_shape,
+                        float *scratchpad_data)
+{ // float path: forwards to the TFLite reference Conv
+  (void)scratchpad_shape; // the scratchpad is not used on the float path
+  (void)scratchpad_data;
+  tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data,
+                              bias_shape, bias_data, output_shape, output_data,
+                              tflite::RuntimeShape(), nullptr); // presumably unused im2col args
+}
+// uint8 path: forwards to the TFLite reference Conv, passing the scratchpad through.
+static inline void Conv(const tflite::ConvParams &params, const tflite::RuntimeShape &input_shape,
+                        const uint8 *input_data, const tflite::RuntimeShape &filter_shape,
+                        const uint8 *filter_data, const tflite::RuntimeShape &bias_shape,
+                        const int32 *bias_data, const tflite::RuntimeShape &output_shape,
+                        uint8 *output_data, const tflite::RuntimeShape &scratchpad_shape,
+                        uint8 *scratchpad_data)
+{
+  (void)scratchpad_shape; // NOTE(review): redundant, both values are used in the call below
+  (void)scratchpad_data;
+  tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data,
+                              bias_shape, bias_data, output_shape, output_data, scratchpad_shape,
+                              scratchpad_data, nullptr);
+}
+// int8 per-channel conv: CMSIS-NN if scratchpad_data is non-null, else the TFLite reference.
+static inline void ConvPerChannel(const tflite::ConvParams &params, const int32_t *mult,
+                                  const int32_t *shifts, const tflite::RuntimeShape &input_shape,
+                                  const int8 *input_data, const tflite::RuntimeShape &filter_shape,
+                                  const int8 *filter_data, const tflite::RuntimeShape &bias_shape,
+                                  const int32 *bias_data, const tflite::RuntimeShape &output_shape,
+                                  int8 *output_data, const tflite::RuntimeShape &scratchpad_shape,
+                                  int8 *scratchpad_data)
+{
+  if (scratchpad_data)
+  {
+    cmsis_nn_conv_params conv_params;
+    conv_params.dilation.h = params.dilation_height_factor;
+    conv_params.dilation.w = params.dilation_width_factor;
+    // This branch expects dilation 1 in both directions (checked by assert only).
+    assert(conv_params.dilation.h == 1);
+    assert(conv_params.dilation.w == 1);
+
+    conv_params.input_offset = params.input_offset;
+    conv_params.output_offset = params.output_offset;
+    conv_params.stride.h = params.stride_height;
+    conv_params.stride.w = params.stride_width;
+    conv_params.padding.h = params.padding_values.height;
+    conv_params.padding.w = params.padding_values.width;
+    conv_params.activation.min = params.quantized_activation_min;
+    conv_params.activation.max = params.quantized_activation_max;
+    // const_cast: presumably the CMSIS-NN struct fields are non-const pointers - confirm.
+    cmsis_nn_per_channel_quant_params quant_params;
+    quant_params.multiplier = const_cast<int32_t *>(mult);
+    quant_params.shift = const_cast<int32_t *>(shifts);
+
+    assert(conv_params.activation.min <= conv_params.activation.max);
+    assert(input_shape.DimensionsCount() == 4);
+    assert(filter_shape.DimensionsCount() == 4);
+    assert(output_shape.DimensionsCount() == 4);
+    const int batch_size = tflite::MatchingDim(input_shape, 0, output_shape, 0);
+    const int input_depth = tflite::MatchingDim(input_shape, 3, filter_shape, 3);
+    const int output_depth = tflite::MatchingDim(filter_shape, 0, output_shape, 3);
+    if (bias_data)
+    {
+      assert(bias_shape.FlatSize() == output_depth);
+    }
+    // Translate the TFLite shapes into CMSIS-NN dims (n, h, w, c).
+    cmsis_nn_dims input_dims;
+    input_dims.n = batch_size;
+    input_dims.h = input_shape.Dims(1);
+    input_dims.w = input_shape.Dims(2);
+    input_dims.c = input_depth;
+
+    cmsis_nn_dims filter_dims;
+    filter_dims.n = output_depth;
+    filter_dims.h = filter_shape.Dims(1);
+    filter_dims.w = filter_shape.Dims(2);
+    filter_dims.c = input_depth;
+
+    cmsis_nn_dims bias_dims;
+    bias_dims.n = 1;
+    bias_dims.h = 1;
+    bias_dims.w = 1;
+    bias_dims.c = output_depth;
+
+    cmsis_nn_dims output_dims;
+    output_dims.n = batch_size;
+    output_dims.h = output_shape.Dims(1);
+    output_dims.w = output_shape.Dims(2);
+    output_dims.c = output_depth;
+    // The scratchpad is handed to CMSIS-NN as its context buffer; shape dim 0 is its size.
+    cmsis_nn_context ctx;
+    ctx.buf = scratchpad_data;
+    ctx.size = scratchpad_shape.Dims(0);
+
+    auto res = arm_convolve_wrapper_s8(&ctx, &conv_params, &quant_params, &input_dims, input_data,
+                                       &filter_dims, filter_data, &bias_dims, bias_data,
+                                       &output_dims, output_data);
+    assert(res == ARM_MATH_SUCCESS); // NOTE(review): status is checked only when asserts are on
+  }
+  else
+  { // no scratchpad buffer: fall back to the TFLite reference kernel
+    tflite::reference_integer_ops::ConvPerChannel(params, mult, shifts, input_shape, input_data,
+                                                  filter_shape, filter_data, bias_shape, bias_data,
+                                                  output_shape, output_data);
+  }
+}
+
+static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad,
+                                         const luci_interpreter::DataType &input_data_type,
+                                         const tflite::ConvParams &params,
+                                         const tflite::RuntimeShape &input_shape,
+                                         const tflite::RuntimeShape &filter_shape,
+                                         const tflite::RuntimeShape &output_shape)
+{
+  // Sizes the Conv2D scratchpad for the CMSIS-NN int8 path. When that path does not apply
+  // (input is not S8, or a dilation factor differs from 1) the tensor is marked non-allocatable.
+  const bool unit_dilation =
+    params.dilation_height_factor == 1 && params.dilation_width_factor == 1;
+  if (input_data_type != loco::DataType::S8 || !unit_dilation)
+  {
+    scratchpad->set_allocatable(false);
+    return;
+  }
+
+  // Only the fields assigned below are initialised before the buffer-size query.
+  cmsis_nn_conv_params conv_params;
+  conv_params.dilation.h = params.dilation_height_factor;
+  conv_params.dilation.w = params.dilation_width_factor;
+  conv_params.input_offset = params.input_offset;
+  conv_params.output_offset = params.output_offset;
+  conv_params.stride.h = params.stride_height;
+  conv_params.stride.w = params.stride_width;
+  conv_params.padding.h = params.padding_values.height;
+  conv_params.padding.w = params.padding_values.width;
+
+  // Batch and channel extents are read through MatchingDim on the paired axes.
+  const int32_t num_batches = tflite::MatchingDim(input_shape, 0, output_shape, 0);
+  const int32_t in_channels = tflite::MatchingDim(input_shape, 3, filter_shape, 3);
+  const int32_t out_channels = tflite::MatchingDim(filter_shape, 0, output_shape, 3);
+
+  cmsis_nn_dims input_dims;
+  input_dims.n = num_batches;
+  input_dims.h = input_shape.Dims(1);
+  input_dims.w = input_shape.Dims(2);
+  input_dims.c = in_channels;
+
+  cmsis_nn_dims filter_dims;
+  filter_dims.n = out_channels;
+  filter_dims.h = filter_shape.Dims(1);
+  filter_dims.w = filter_shape.Dims(2);
+  filter_dims.c = in_channels;
+
+  cmsis_nn_dims output_dims;
+  output_dims.n = num_batches;
+  output_dims.h = output_shape.Dims(1);
+  output_dims.w = output_shape.Dims(2);
+  output_dims.c = out_channels;
+
+  // Resize the scratchpad using the size reported by CMSIS-NN.
+  const int32_t required_size = arm_convolve_wrapper_s8_get_buffer_size(
+    &conv_params, &input_dims, &filter_dims, &output_dims);
+
+  luci_interpreter::Shape scratchpad_shape{required_size};
+  scratchpad->resize(scratchpad_shape);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_CONV2D_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALDepthToSpace.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALDepthToSpace.h
new file mode 100644
index 000000000..8463e571e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALDepthToSpace.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
+#define LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
+
+#include <tensorflow/lite/kernels/internal/reference/depth_to_space.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void DepthToSpace(const tflite::DepthToSpaceParams &op_params,
+                                const tflite::RuntimeShape &unextended_input_shape,
+                                const T *input_data,
+                                const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{ // PAL entry point: passes every argument straight to the TFLite reference DepthToSpace.
+  tflite::reference_ops::DepthToSpace(op_params, unextended_input_shape, input_data,
+                                      unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALDepthwiseConv2d.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALDepthwiseConv2d.h
new file mode 100644
index 000000000..120dcd803
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALDepthwiseConv2d.h
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H
+#define LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h>
+#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h>
+#include <arm_nnfunctions.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+DepthwiseConvPerChannel(const tflite::DepthwiseParams &params, const int32_t *output_multiplier,
+                        const int32_t *output_shift, const tflite::RuntimeShape &input_shape,
+                        const T *input_data, const tflite::RuntimeShape &filter_shape,
+                        const T *filter_data, const tflite::RuntimeShape &bias_shape,
+                        const int32_t *bias_data, const tflite::RuntimeShape &output_shape,
+                        T *output_data, const tflite::RuntimeShape &scratchpad_shape,
+                        T *scratchpad_data)
+{
+  // Generic fallback: per-channel depthwise convolution is only implemented by the int8_t
+  // specialization in this header. Any other element type trips the assert in debug builds;
+  // when assert() is compiled out the call returns without writing output_data.
+  assert(false && "DepthwiseConvPerChannel NYI");
+  // Every parameter is referenced once so unused-parameter warnings stay quiet.
+  (void)params;
+  (void)output_multiplier;
+  (void)output_shift;
+  (void)input_shape;
+  (void)input_data;
+  (void)filter_shape;
+  (void)filter_data;
+  (void)bias_shape;
+  (void)bias_data;
+  (void)output_shape;
+  (void)output_data;
+  (void)scratchpad_shape;
+  (void)scratchpad_data;
+}
+
+template <>
+inline void DepthwiseConvPerChannel<int8_t>(
+  const tflite::DepthwiseParams &params, const int32_t *output_multiplier,
+  const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data,
+  const tflite::RuntimeShape &filter_shape, const int8_t *filter_data,
+  const tflite::RuntimeShape &bias_shape, const int32_t *bias_data,
+  const tflite::RuntimeShape &output_shape, int8_t *output_data,
+  const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
+{
+  // int8 per-channel depthwise convolution. A non-null scratchpad_data selects the CMSIS-NN
+  // wrapper, a path that asserts unit dilation; otherwise the TFLite reference kernel runs.
+  if (scratchpad_data)
+  {
+    cmsis_nn_dw_conv_params dw_conv_params;
+    dw_conv_params.dilation.h = params.dilation_height_factor;
+    dw_conv_params.dilation.w = params.dilation_width_factor;
+    assert(dw_conv_params.dilation.h == 1);
+    assert(dw_conv_params.dilation.w == 1);
+    dw_conv_params.input_offset = params.input_offset;
+    dw_conv_params.output_offset = params.output_offset;
+    dw_conv_params.stride.h = params.stride_height;
+    dw_conv_params.stride.w = params.stride_width;
+    dw_conv_params.padding.h = params.padding_values.height;
+    dw_conv_params.padding.w = params.padding_values.width;
+    dw_conv_params.activation.min = params.quantized_activation_min;
+    dw_conv_params.activation.max = params.quantized_activation_max;
+    dw_conv_params.ch_mult = params.depth_multiplier;
+
+    // CMSIS-NN reads one multiplier/shift per output channel, so pass the caller's arrays (the
+    // same ones the reference path receives); const_cast only satisfies the non-const fields.
+    cmsis_nn_per_channel_quant_params quant_params;
+    quant_params.multiplier = const_cast<int32_t *>(output_multiplier);
+    quant_params.shift = const_cast<int32_t *>(output_shift);
+
+    assert(dw_conv_params.activation.min <= dw_conv_params.activation.max);
+    const int batch_size = tflite::MatchingDim(input_shape, 0, output_shape, 0);
+    const int output_depth = tflite::MatchingDim(filter_shape, 3, output_shape, 3);
+    if (bias_data)
+    {
+      assert(bias_shape.FlatSize() == output_depth);
+    }
+
+    cmsis_nn_dims input_dims;
+    input_dims.n = batch_size;
+    input_dims.h = input_shape.Dims(1);
+    input_dims.w = input_shape.Dims(2);
+    input_dims.c = input_shape.Dims(3);
+
+    cmsis_nn_dims filter_dims;
+    filter_dims.n = filter_shape.Dims(0);
+    filter_dims.h = filter_shape.Dims(1);
+    filter_dims.w = filter_shape.Dims(2);
+    filter_dims.c = output_depth;
+
+    cmsis_nn_dims bias_dims;
+    bias_dims.n = 1;
+    bias_dims.h = 1;
+    bias_dims.w = 1;
+    bias_dims.c = output_depth;
+
+    cmsis_nn_dims output_dims;
+    output_dims.n = batch_size;
+    output_dims.h = output_shape.Dims(1);
+    output_dims.w = output_shape.Dims(2);
+    output_dims.c = output_depth;
+
+    cmsis_nn_context ctx;
+    ctx.buf = scratchpad_data;
+    ctx.size = scratchpad_shape.Dims(0);
+
+    auto res = arm_depthwise_conv_wrapper_s8(&ctx, &dw_conv_params, &quant_params, &input_dims,
+                                             input_data, &filter_dims, filter_data, &bias_dims,
+                                             bias_data, &output_dims, output_data);
+    assert(res == ARM_MATH_SUCCESS);
+    (void)res; // keeps the variable referenced when assert() is compiled out
+  }
+  else
+  {
+    tflite::reference_integer_ops::DepthwiseConvPerChannel(
+      params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
+      bias_shape, bias_data, output_shape, output_data);
+  }
+}
+
+static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad,
+                                         const tflite::DepthwiseParams &params,
+                                         const luci_interpreter::DataType &input_data_type,
+                                         const tflite::RuntimeShape &input_shape,
+                                         const tflite::RuntimeShape &filter_shape,
+                                         const tflite::RuntimeShape &output_shape)
+{ // Sizes the scratchpad for the CMSIS-NN int8 depthwise path, else marks it non-allocatable.
+  if (input_data_type != loco::DataType::S8 || params.dilation_height_factor != 1 ||
+      params.dilation_width_factor != 1)
+  {
+    scratchpad->set_allocatable(false);
+    return;
+  }
+  // Value-initialise and fill the geometry so the size query never reads indeterminate fields.
+  cmsis_nn_dw_conv_params dw_conv_params{};
+  dw_conv_params.dilation.h = params.dilation_height_factor;
+  dw_conv_params.dilation.w = params.dilation_width_factor;
+  dw_conv_params.stride.h = params.stride_height;
+  dw_conv_params.stride.w = params.stride_width;
+  dw_conv_params.padding.h = params.padding_values.height;
+  dw_conv_params.padding.w = params.padding_values.width;
+  dw_conv_params.ch_mult = params.depth_multiplier;
+  const int batch_size = tflite::MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = tflite::MatchingDim(filter_shape, 3, output_shape, 3);
+
+  cmsis_nn_dims input_dims;
+  input_dims.n = batch_size;
+  input_dims.h = input_shape.Dims(1);
+  input_dims.w = input_shape.Dims(2);
+  input_dims.c = input_shape.Dims(3);
+
+  cmsis_nn_dims filter_dims;
+  filter_dims.n = filter_shape.Dims(0);
+  filter_dims.h = filter_shape.Dims(1);
+  filter_dims.w = filter_shape.Dims(2);
+  filter_dims.c = output_depth;
+
+  cmsis_nn_dims output_dims;
+  output_dims.n = batch_size;
+  output_dims.h = output_shape.Dims(1);
+  output_dims.w = output_shape.Dims(2);
+  output_dims.c = output_depth;
+  const int32_t buf_size = arm_depthwise_conv_wrapper_s8_get_buffer_size(
+    &dw_conv_params, &input_dims, &filter_dims, &output_dims);
+  auto data_type_size = static_cast<int32_t>(luci_interpreter::getDataTypeSize(input_data_type));
+  luci_interpreter::Shape scratchpad_shape{buf_size * data_type_size};
+  scratchpad->resize(scratchpad_shape);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALDequantize.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALDequantize.h
new file mode 100644
index 000000000..15ff0327b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALDequantize.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_DEQUANTIZE_H
+#define LUCI_INTERPRETER_PAL_DEQUANTIZE_H
+
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+
+namespace luci_interpreter_pal
+{
+
+template <typename T>
+static inline void Dequantize(tflite::DequantizationParams &params,
+                              const tflite::RuntimeShape &input_shape, const T *input_data,
+                              const tflite::RuntimeShape &output_shape, float *output_data)
+{ // Generic overload: forwards unchanged to tflite::reference_integer_ops::Dequantize<T>.
+  tflite::reference_integer_ops::Dequantize<T>(params, input_shape, input_data, output_shape,
+                                               output_data);
+}
+
+static inline void Dequantize(tflite::DequantizationParams &params,
+                              const tflite::RuntimeShape &input_shape, const uint8_t *input_data,
+                              const tflite::RuntimeShape &output_shape, float *output_data)
+{ // uint8_t overload: uses tflite::reference_ops::Dequantize, not the integer_ops variant.
+  tflite::reference_ops::Dequantize(params, input_shape, input_data, output_shape, output_data);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_DEQUANTIZE_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALElu.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALElu.h
new file mode 100644
index 000000000..4089d0a0c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALElu.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_ELU_H
+#define LUCI_INTERPRETER_PAL_ELU_H
+
+#include <tensorflow/lite/kernels/internal/reference/elu.h>
+
+namespace luci_interpreter_pal
+{
+
+static inline void Elu(const tflite::RuntimeShape &input_shape, const float *input_data,
+                       const tflite::RuntimeShape &output_shape, float *output_data)
+{ // Float-only ELU: delegates unchanged to the TFLite reference kernel.
+  tflite::reference_ops::Elu(input_shape, input_data, output_shape, output_data);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_ELU_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALFullyConnected.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALFullyConnected.h
new file mode 100644
index 000000000..32e905761
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALFullyConnected.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_FULLYCONNECTED_H
+#define LUCI_INTERPRETER_PAL_FULLYCONNECTED_H
+
+#include <tensorflow/lite/kernels/internal/reference/fully_connected.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h>
+#include <arm_nnfunctions.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void FullyConnected(const tflite::FullyConnectedParams &params,
+                                  const tflite::RuntimeShape &input_shape, const T *input_data,
+                                  const tflite::RuntimeShape &filter_shape, const T *filter_data,
+                                  const tflite::RuntimeShape &bias_shape, const int32_t *bias_data,
+                                  const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  {
+    // MARK: Not implemented for generic T; only the int8_t specialization does real work.
+    assert(false && "FullyConnected NYI");
+    (void)params; // the casts below only silence unused-parameter warnings
+    (void)input_shape;
+    (void)input_data;
+    (void)filter_shape;
+    (void)filter_data;
+    (void)bias_shape;
+    (void)bias_data;
+    (void)output_shape;
+    (void)output_data;
+  }
+}
+
+template <>
+inline void
+FullyConnected<int8_t>(const tflite::FullyConnectedParams &params,
+                       const tflite::RuntimeShape &input_shape, const int8_t *input_data,
+                       const tflite::RuntimeShape &filter_shape, const int8_t *filter_data,
+                       const tflite::RuntimeShape &bias_shape, const int32_t *bias_data,
+                       const tflite::RuntimeShape &output_shape, int8_t *output_data)
+{
+  // int8 fully-connected layer executed by CMSIS-NN arm_fully_connected_s8 (rank-2 output only).
+  assert(output_shape.DimensionsCount() == 2);
+  const int batches = output_shape.Dims(0);
+  const int output_depth = output_shape.Dims(1);
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+
+  cmsis_nn_fc_params fc_params;
+  fc_params.input_offset = params.input_offset;
+  fc_params.output_offset = params.output_offset;
+  fc_params.filter_offset = params.weights_offset;
+  fc_params.activation.min = params.quantized_activation_min;
+  fc_params.activation.max = params.quantized_activation_max;
+
+  cmsis_nn_per_tensor_quant_params quant_params;
+  quant_params.multiplier = params.output_multiplier;
+  quant_params.shift = params.output_shift;
+
+  cmsis_nn_dims input_dims;
+  input_dims.n = batches;
+  input_dims.h = 1;
+  input_dims.w = 1;
+  input_dims.c = accum_depth;
+
+  cmsis_nn_dims filter_dims;
+  filter_dims.n = accum_depth;
+  filter_dims.h = 1;
+  filter_dims.w = 1;
+  filter_dims.c = output_depth;
+
+  cmsis_nn_dims bias_dims;
+  bias_dims.n = 1;
+  bias_dims.h = 1;
+  bias_dims.w = 1;
+  bias_dims.c = output_depth;
+
+  cmsis_nn_dims output_dims;
+  output_dims.n = batches;
+  output_dims.h = 1;
+  output_dims.w = 1;
+  output_dims.c = output_depth;
+
+  // Allocate scratch memory only when CMSIS-NN asks for some; otherwise ctx.buf stays null.
+  const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
+  auto buffer =
+    buf_size > 0 ? std::make_unique<int8_t[]>(buf_size) : std::unique_ptr<int8_t[]>{};
+  cmsis_nn_context ctx;
+  ctx.buf = buffer.get();
+  ctx.size = buf_size;
+
+  auto res =
+    arm_fully_connected_s8(&ctx, &fc_params, &quant_params, &input_dims, input_data, &filter_dims,
+                           filter_data, &bias_dims, bias_data, &output_dims, output_data);
+  assert(res == ARM_MATH_SUCCESS);
+  (void)res; // keeps the variable referenced when assert() is compiled out
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_FULLYCONNECTED_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALL2Normalize.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALL2Normalize.h
new file mode 100644
index 000000000..f84742a44
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALL2Normalize.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_L2NORMALIZE_H
+#define LUCI_INTERPRETER_PAL_L2NORMALIZE_H
+
+#include <tensorflow/lite/kernels/internal/reference/l2normalization.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void L2Normalization(const tflite::L2NormalizationParams &op_params,
+                                   const tflite::RuntimeShape &input_shape, const T *input_data,
+                                   const tflite::RuntimeShape &output_shape, T *output_data)
+{ // Thin forwarder to the TFLite reference L2Normalization; arguments pass through unchanged.
+  tflite::reference_ops::L2Normalization(op_params, input_shape, input_data, output_shape,
+                                         output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_L2NORMALIZE_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALL2Pool2D.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALL2Pool2D.h
new file mode 100644
index 000000000..38a302fc6
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALL2Pool2D.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_L2POOL2D_H
+#define LUCI_INTERPRETER_PAL_L2POOL2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/pooling.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void L2Pool(const tflite::PoolParams &params, const tflite::RuntimeShape &input_shape,
+                          const T *input_data, const tflite::RuntimeShape &output_shape,
+                          T *output_data)
+{ // Thin forwarder to the TFLite reference L2Pool; arguments pass through unchanged.
+  tflite::reference_ops::L2Pool(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_L2POOL2D_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALLeakyRelu.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALLeakyRelu.h
new file mode 100644
index 000000000..9ccd2224f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALLeakyRelu.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_LEAKYRELU_H
+#define LUCI_INTERPRETER_PAL_LEAKYRELU_H
+
+#include <tensorflow/lite/kernels/internal/reference/leaky_relu.h>
+
+namespace luci_interpreter_pal
+{
+static inline void LeakyRelu(const tflite::LeakyReluParams &params,
+                             const tflite::RuntimeShape &input_shape, const float *input_data,
+                             const tflite::RuntimeShape &output_shape, float *output_data)
+{ // Float-only LeakyRelu: delegates unchanged to the TFLite reference kernel.
+  tflite::reference_ops::LeakyRelu(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_LEAKYRELU_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALMul.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALMul.h
new file mode 100644
index 000000000..347a97a83
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALMul.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_MUL_H
+#define LUCI_INTERPRETER_PAL_MUL_H
+
+#include <tensorflow/lite/kernels/internal/reference/mul.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Mul(tflite::ArithmeticParams &params, const tflite::RuntimeShape &input1_shape,
+                       const T *input1_data, const tflite::RuntimeShape &input2_shape,
+                       const T *input2_data, const tflite::RuntimeShape &output_shape,
+                       T *output_data)
+{ // NOTE(review): always routed through BroadcastMul4DSlow, never reference_ops::Mul - confirm.
+  tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape,
+                                            input2_data, output_shape, output_data);
+}
+
+template <typename T>
+static inline void
+BroadcastMul4DSlow(tflite::ArithmeticParams &params, const tflite::RuntimeShape &input1_shape,
+                   const T *input1_data, const tflite::RuntimeShape &input2_shape,
+                   const T *input2_data, const tflite::RuntimeShape &output_shape, T *output_data)
+{ // Same-named forwarder to tflite::reference_ops::BroadcastMul4DSlow; no argument is altered.
+  tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape,
+                                            input2_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_MUL_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALNeg.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALNeg.h
new file mode 100644
index 000000000..be5903a0c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALNeg.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_NEG_H
+#define LUCI_INTERPRETER_PAL_NEG_H
+
+#include <tensorflow/lite/kernels/internal/reference/neg.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Negate(const tflite::RuntimeShape &input_shape, const T *input_data,
+                          const tflite::RuntimeShape &output_shape, T *output_data)
+{ // Thin forwarder to the TFLite reference Negate; arguments pass through unchanged.
+  tflite::reference_ops::Negate(input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_NEG_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALQuantize.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALQuantize.h
new file mode 100644
index 000000000..6046789ae
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALQuantize.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_QUANTIZE_H
+#define LUCI_INTERPRETER_PAL_QUANTIZE_H
+
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Quantize(tflite::QuantizationParams &params,
+                            const tflite::RuntimeShape &input_shape, const float *input_data,
+                            const tflite::RuntimeShape &output_shape, T *output_data)
+{ // Float -> T quantization, implemented by tflite::reference_ops::AffineQuantize.
+  tflite::reference_ops::AffineQuantize(params, input_shape, input_data, output_shape, output_data);
+}
+
+template <typename Input, typename Output>
+static inline void Requantize(const Input *input_data, int32_t size,
+                              int32_t effective_scale_multiplier, int32_t effective_scale_shift,
+                              int32_t input_zero_point, int32_t output_zero_point,
+                              Output *output_data)
+{ // Thin forwarder to tflite::reference_ops::Requantize; arguments keep their order.
+  tflite::reference_ops::Requantize(input_data, size, effective_scale_multiplier,
+                                    effective_scale_shift, input_zero_point, output_zero_point,
+                                    output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_QUANTIZE_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALResizeBilinear.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALResizeBilinear.h
new file mode 100644
index 000000000..cc9f0fd54
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALResizeBilinear.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
+#define LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
+
+#include <tensorflow/lite/kernels/internal/reference/resize_bilinear.h>
+
+namespace luci_interpreter_pal
+{
+// ResizeBilinear for the CMSIS-NN PAL.
+//
+// This is a plain pass-through: every argument goes, in the same order, to
+// tflite::reference_ops::ResizeBilinear.
+template <typename T>
+static inline void ResizeBilinear(const tflite::ResizeBilinearParams &op_params,
+                                  const tflite::RuntimeShape &unextended_input_shape,
+                                  const T *input_data,
+                                  const tflite::RuntimeShape &output_size_shape,
+                                  const int32 *output_size_data,
+                                  const tflite::RuntimeShape &unextended_output_shape,
+                                  T *output_data)
+{
+  namespace tfl_ref = tflite::reference_ops;
+  tfl_ref::ResizeBilinear(op_params, unextended_input_shape, input_data, output_size_shape,
+                          output_size_data, unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALResizeNearestNeighbor.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALResizeNearestNeighbor.h
new file mode 100644
index 000000000..f4d5a6ed3
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALResizeNearestNeighbor.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
+#define LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
+
+#include <tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h>
+
+namespace luci_interpreter_pal
+{
+// ResizeNearestNeighbor for the CMSIS-NN PAL.
+//
+// This is a plain pass-through: every argument goes, in the same order, to
+// tflite::reference_ops::ResizeNearestNeighbor.
+template <typename T>
+static inline void ResizeNearestNeighbor(const tflite::ResizeNearestNeighborParams &op_params,
+                                         const tflite::RuntimeShape &unextended_input_shape,
+                                         const T *input_data,
+                                         const tflite::RuntimeShape &output_size_shape,
+                                         const int32 *output_size_data,
+                                         const tflite::RuntimeShape &unextended_output_shape,
+                                         T *output_data)
+{
+  namespace tfl_ref = tflite::reference_ops;
+  tfl_ref::ResizeNearestNeighbor(op_params, unextended_input_shape, input_data,
+                                 output_size_shape, output_size_data, unextended_output_shape,
+                                 output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSVDF.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSVDF.h
new file mode 100644
index 000000000..a4a5b2a78
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSVDF.h
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SVDF_H
+#define LUCI_INTERPRETER_PAL_SVDF_H
+
+#include <arm_nn_types.h>
+#include <arm_nnfunctions.h>
+
+namespace luci_interpreter_pal
+{
+// Runs one fully-quantized (int8) SVDF step through the CMSIS-NN kernel arm_svdf_s8.
+//
+// The tensor shapes are repacked into cmsis_nn_dims (only the n and h fields are
+// filled in), scale_1_a/scale_1_b become the input quantization multiplier/shift,
+// scale_2_a/scale_2_b the output ones, and input_zp/output_zp the input/output
+// offsets. scratchpad_data and output_temp_data are handed to the kernel as its two
+// context buffers, activation_state_data as its state buffer, and the result is
+// written to output_data. Activation limits are fixed to the full int16 range for
+// the state and the full int8 range for the output.
+static inline void
+IntegerSVDF(const TfLiteSVDFParams &params, const tflite::RuntimeShape &input_shape,
+            const int8_t *input_data, const tflite::RuntimeShape &weight_feature_shape,
+            const int8_t *weight_feature_data, const tflite::RuntimeShape &weight_time_shape,
+            const int16_t *weight_time_data, const tflite::RuntimeShape &bias_shape,
+            const int32_t *bias_data, int16_t *activation_state_data,
+            const tflite::RuntimeShape &output_shape, int8_t *output_data, int32_t *scratchpad_data,
+            int32_t *output_temp_data, int32_t scale_1_a, int scale_1_b, int32_t scale_2_a,
+            int scale_2_b, int32_t input_zp, int32_t output_zp)
+{
+  const int32_t batch_size = input_shape.Dims(0);
+  const int32_t num_filters = weight_feature_shape.Dims(0);
+  const int32_t memory_size = weight_time_shape.Dims(1);
+
+  // Every CMSIS-NN struct below is value-initialized ({}): the kernel receives them
+  // by pointer, so any field this wrapper does not assign must not be left
+  // indeterminate.
+  cmsis_nn_dims input_dims{};
+  input_dims.n = input_shape.Dims(0);
+  input_dims.h = input_shape.Dims(1);
+
+  cmsis_nn_dims weights_feature_dims{};
+  weights_feature_dims.n = weight_feature_shape.Dims(0);
+  weights_feature_dims.h = weight_feature_shape.Dims(1);
+
+  cmsis_nn_dims weights_time_dims{};
+  weights_time_dims.n = weight_time_shape.Dims(0);
+  weights_time_dims.h = weight_time_shape.Dims(1);
+
+  cmsis_nn_dims bias_dims{};
+  bias_dims.n = bias_shape.Dims(0);
+
+  // The state holds memory_size entries per filter for every batch.
+  cmsis_nn_dims state_dims{};
+  state_dims.n = batch_size;
+  state_dims.h = memory_size * num_filters;
+
+  cmsis_nn_dims output_dims{};
+  output_dims.n = output_shape.Dims(0);
+  output_dims.h = output_shape.Dims(1);
+
+  cmsis_nn_svdf_params svdf_params{};
+  svdf_params.rank = params.rank;
+  svdf_params.input_offset = input_zp;
+  svdf_params.output_offset = output_zp;
+
+  svdf_params.input_activation.min = INT16_MIN;
+  svdf_params.input_activation.max = INT16_MAX;
+
+  svdf_params.output_activation.min = INT8_MIN;
+  svdf_params.output_activation.max = INT8_MAX;
+
+  cmsis_nn_per_tensor_quant_params in_quant_params{};
+  in_quant_params.multiplier = scale_1_a;
+  in_quant_params.shift = scale_1_b;
+
+  cmsis_nn_per_tensor_quant_params out_quant_params{};
+  out_quant_params.multiplier = scale_2_a;
+  out_quant_params.shift = scale_2_b;
+
+  cmsis_nn_context scratch_ctx{};
+  scratch_ctx.buf = scratchpad_data;
+
+  cmsis_nn_context scratch_output_ctx{};
+  scratch_output_ctx.buf = output_temp_data;
+
+  // NOTE(review): the status returned by arm_svdf_s8 is not inspected here, exactly
+  // as before — confirm whether callers need failures to be reported.
+  arm_svdf_s8(&scratch_ctx, &scratch_output_ctx, &svdf_params, &in_quant_params, &out_quant_params,
+              &input_dims, input_data, &state_dims, activation_state_data, &weights_feature_dims,
+              weight_feature_data, &weights_time_dims, weight_time_data, &bias_dims, bias_data,
+              &output_dims, output_data);
+}
+// Runs one float SVDF step with plain C++ loops plus the TFLite reference helper.
+//
+// Steps, in order:
+//   1. shift the whole activation state left by one element;
+//   2. for every batch and filter, store dot(weight_feature row, input row) in the
+//      last ("rightmost") slot of that filter's memory_size-long state window;
+//   3. call tflite::reference_ops::ApplyTimeWeightsBiasAndActivation to produce
+//      output_data from the state, weight_time_data and bias_data.
+//
+// input_shape is read as {batch_size, input_size}, weight_feature_shape.Dims(0) as
+// num_filters and weight_time_shape.Dims(1) as memory_size; num_units is
+// num_filters / rank.
+static inline void
+FloatSVDF(const TfLiteSVDFParams &params, const tflite::RuntimeShape &input_shape,
+          const float *input_data, const tflite::RuntimeShape &weight_feature_shape,
+          const float *weight_feature_data, const tflite::RuntimeShape &weight_time_shape,
+          const float *weight_time_data, const tflite::RuntimeShape &bias_shape,
+          const float *bias_data, float *scratchpad_data, float *activation_state_data,
+          const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  const int32_t rank = params.rank;
+  const int32_t batch_size = input_shape.Dims(0);
+  const int32_t input_size = input_shape.Dims(1);
+  const int32_t num_filters = weight_feature_shape.Dims(0);
+  const int32_t num_units = num_filters / rank;
+  const int32_t memory_size = weight_time_shape.Dims(1);
+
+  // Left shift the activation_state.
+  // The loop is bounded by the element count, so an empty state (any of the three
+  // factors being zero) is a no-op instead of walking past the buffer.
+  {
+    const int32_t state_size = batch_size * num_filters * memory_size;
+    for (int32_t i = 1; i < state_size; ++i)
+    {
+      activation_state_data[i - 1] = activation_state_data[i];
+    }
+  }
+
+  // Note: no need to clear the latest activation, matmul is not accumulative.
+
+  // Compute conv1d(inputs, weights_feature).
+  // The activation_state's rightmost column is used to save current cycle
+  // activation: each result goes to slot memory_size - 1 of its filter's window,
+  // and consecutive windows are memory_size elements apart.
+
+  // Perform batched matrix vector multiply operation:
+  for (int32_t batch = 0; batch < batch_size; ++batch)
+  {
+    // Both pointers depend only on the batch, so they are set up once per batch.
+    const float *batch_input = input_data + batch * input_size;
+    const float *filter_row = weight_feature_data;
+    for (int32_t filter = 0; filter < num_filters; ++filter)
+    {
+      float dot_prod = 0.0f;
+      for (int32_t k = 0; k < input_size; ++k)
+      {
+        dot_prod += filter_row[k] * batch_input[k];
+      }
+      // Indexed store: no pointer is ever advanced beyond the end of the state.
+      const int32_t window = batch * num_filters + filter;
+      activation_state_data[window * memory_size + (memory_size - 1)] = dot_prod;
+      filter_row += input_size;
+    }
+  }
+
+  tflite::reference_ops::ApplyTimeWeightsBiasAndActivation(
+    batch_size, memory_size, num_filters, num_units, rank, weight_time_data, bias_data,
+    params.activation, activation_state_data, scratchpad_data, output_data);
+}
+
+// Sizes the SVDF scratchpad tensors for the CMSIS-NN PAL.
+//
+// - FLOAT32 input combined with S8 or U8 feature weights (the "hybrid" case) is
+//   rejected with std::runtime_error before anything is resized.
+// - scratchpad_1 is always resized to {batch_size, num_filters}.
+// - scratchpad_2 is resized to {batch_size, num_units} only for S8 input.
+// scratchpad_3..6, input_shape and weight_time_shape are accepted but not used here.
+static inline void SetupScratchpadTensor(
+  const luci_interpreter::DataType &input_data_type,
+  const luci_interpreter::DataType &weight_feature_data_type,
+  luci_interpreter::Tensor *scratchpad_1, luci_interpreter::Tensor *scratchpad_2,
+  luci_interpreter::Tensor *scratchpad_3, luci_interpreter::Tensor *scratchpad_4,
+  luci_interpreter::Tensor *scratchpad_5, luci_interpreter::Tensor *scratchpad_6,
+  const luci_interpreter::Shape input_shape, const luci_interpreter::Shape weight_time_shape,
+  const int32_t batch_size, const int32_t num_filters, const int32_t num_units)
+{
+  // Mark the parameters this platform has no use for.
+  static_cast<void>(input_shape);
+  static_cast<void>(weight_time_shape);
+  static_cast<void>(scratchpad_3);
+  static_cast<void>(scratchpad_4);
+  static_cast<void>(scratchpad_5);
+  static_cast<void>(scratchpad_6);
+
+  const bool float_input = (input_data_type == loco::DataType::FLOAT32);
+  const bool integer_weights = (weight_feature_data_type == loco::DataType::S8) ||
+                               (weight_feature_data_type == loco::DataType::U8);
+  if (float_input && integer_weights)
+  {
+    throw std::runtime_error("Hybrid type is not supported for cmsisnn");
+  }
+
+  // Needed for every supported type combination.
+  scratchpad_1->resize({batch_size, num_filters});
+
+  // The second scratchpad only exists for the full-integer op.
+  if (input_data_type != loco::DataType::S8)
+  {
+    return;
+  }
+  scratchpad_2->resize({batch_size, num_units});
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SVDF_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSoftmax.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSoftmax.h
new file mode 100644
index 000000000..6bbda4867
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSoftmax.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SOFTMAX_H
+#define LUCI_INTERPRETER_PAL_SOFTMAX_H
+
+#include <tensorflow/lite/kernels/internal/reference/softmax.h>
+#include <arm_nnfunctions.h>
+
+namespace luci_interpreter_pal
+{
+// Deliberately empty on this platform: no lookup table is populated and all three
+// arguments are ignored.
+static inline void PopulateSoftmaxLookupTable(tflite::SoftmaxParams *data, float input_scale,
+                                              float beta)
+{
+  // Nothing to do for mcu; the casts only mark the parameters as intentionally unused.
+  static_cast<void>(data);
+  static_cast<void>(input_scale);
+  static_cast<void>(beta);
+}
+
+// Fills the quantized-softmax fields of *params from input_scale and beta.
+//
+// tflite::PreprocessSoftmaxScaling (with 5 integer bits for the scaled difference)
+// yields the multiplier and left shift stored in input_multiplier and
+// input_left_shift; diff_min is the negated tflite::CalculateInputRadius for the same
+// bit count and that left shift.
+static inline void InitializeParams(tflite::SoftmaxParams *params, float input_scale, float beta)
+{
+  constexpr int kScaledDiffIntegerBits = 5;
+
+  int32_t beta_multiplier;
+  int beta_left_shift;
+  tflite::PreprocessSoftmaxScaling(beta, input_scale, kScaledDiffIntegerBits, &beta_multiplier,
+                                   &beta_left_shift);
+
+  params->input_multiplier = beta_multiplier;
+  params->input_left_shift = beta_left_shift;
+
+  const auto input_radius =
+    tflite::CalculateInputRadius(kScaledDiffIntegerBits, params->input_left_shift);
+  params->diff_min = -input_radius;
+}
+
+// Generic Softmax: not implemented on this platform.
+//
+// Any element type without an explicit specialization ends up here. The assert
+// fires when assertions are enabled; when they are compiled out the function
+// returns without reading its inputs or writing output_data.
+template <typename T>
+static inline void Softmax(const tflite::SoftmaxParams &params,
+                           const tflite::RuntimeShape &input_shape, const T *input_data,
+                           const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  // The arguments are only named so the signature matches the specializations.
+  static_cast<void>(params);
+  static_cast<void>(input_shape);
+  static_cast<void>(input_data);
+  static_cast<void>(output_shape);
+  static_cast<void>(output_data);
+
+  // MARK: this operation is not supported on mcu at the moment
+  assert(false && "Softmax NYI");
+}
+
+// int8 Softmax, executed by the CMSIS-NN kernel arm_softmax_s8.
+//
+// The last dimension is the softmax axis: its matched size is passed as the row
+// length, and the matched flat size of all remaining dimensions as the row count.
+// The multiplier, left shift and diff_min come straight from params.
+template <>
+inline void Softmax<int8_t>(const tflite::SoftmaxParams &params,
+                            const tflite::RuntimeShape &input_shape, const int8_t *input_data,
+                            const tflite::RuntimeShape &output_shape, int8_t *output_data)
+{
+  const int last_dim = input_shape.DimensionsCount() - 1;
+
+  // Both helpers also cross-check the input shape against the output shape.
+  const int num_rows = tflite::MatchingFlatSizeSkipDim(input_shape, last_dim, output_shape);
+  const int row_size = tflite::MatchingDim(input_shape, last_dim, output_shape, last_dim);
+
+  const int32_t input_multiplier = params.input_multiplier;
+  const int32_t input_left_shift = params.input_left_shift;
+  const int32_t min_diff = params.diff_min;
+
+  arm_softmax_s8(input_data, num_rows, row_size, input_multiplier, input_left_shift, min_diff,
+                 output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SOFTMAX_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSpaceToBatchND.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSpaceToBatchND.h
new file mode 100644
index 000000000..fdddaa929
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSpaceToBatchND.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
+#define LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
+
+#include <tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h>
+
+namespace luci_interpreter_pal
+{
+// SpaceToBatchND for the CMSIS-NN PAL.
+//
+// This is a plain pass-through: every argument goes, in the same order, to
+// tflite::reference_ops::SpaceToBatchND.
+template <typename T>
+static inline void SpaceToBatchND(const tflite::SpaceToBatchParams &params,
+                                  const tflite::RuntimeShape &unextended_input1_shape,
+                                  const T *input1_data,
+                                  const tflite::RuntimeShape &unextended_input2_shape,
+                                  const int32 *block_shape_data,
+                                  const tflite::RuntimeShape &unextended_input3_shape,
+                                  const int32 *paddings_data,
+                                  const tflite::RuntimeShape &unextended_output_shape,
+                                  T *output_data)
+{
+  namespace tfl_ref = tflite::reference_ops;
+  tfl_ref::SpaceToBatchND(params, unextended_input1_shape, input1_data, unextended_input2_shape,
+                          block_shape_data, unextended_input3_shape, paddings_data,
+                          unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSpaceToDepth.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSpaceToDepth.h
new file mode 100644
index 000000000..816b7f663
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSpaceToDepth.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SPACETODEPTH_H
+#define LUCI_INTERPRETER_PAL_SPACETODEPTH_H
+
+#include <tensorflow/lite/kernels/internal/reference/space_to_depth.h>
+
+namespace luci_interpreter_pal
+{
+// SpaceToDepth for the CMSIS-NN PAL.
+//
+// This is a plain pass-through: every argument goes, in the same order, to
+// tflite::reference_ops::SpaceToDepth.
+template <typename T>
+static inline void SpaceToDepth(const tflite::SpaceToDepthParams &op_params,
+                                const tflite::RuntimeShape &unextended_input_shape,
+                                const T *input_data,
+                                const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  namespace tfl_ref = tflite::reference_ops;
+  tfl_ref::SpaceToDepth(op_params, unextended_input_shape, input_data, unextended_output_shape,
+                        output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SPACETODEPTH_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSub.h b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSub.h
new file mode 100644
index 000000000..ea57578c6
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/PALSub.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SUB_H
+#define LUCI_INTERPRETER_PAL_SUB_H
+
+#include <tensorflow/lite/kernels/internal/reference/sub.h>
+
+namespace luci_interpreter_pal
+{
+// Element-wise subtraction for the CMSIS-NN PAL.
+//
+// This is a plain pass-through: both operands, the output and params go, in the same
+// order, to tflite::reference_ops::Sub.
+template <typename T>
+static inline void Sub(const tflite::ArithmeticParams &params,
+                       const tflite::RuntimeShape &input1_shape,
+                       const T *input1_data,
+                       const tflite::RuntimeShape &input2_shape,
+                       const T *input2_data,
+                       const tflite::RuntimeShape &output_shape,
+                       T *output_data)
+{
+  namespace tfl_ref = tflite::reference_ops;
+  tfl_ref::Sub(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
+               output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SUB_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/cmsisnn/pal.cmake b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/pal.cmake
new file mode 100644
index 000000000..a68b363d9
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/cmsisnn/pal.cmake
@@ -0,0 +1,65 @@
+macro(initialize_pal) # Looks up the five source packages this PAL needs; sets PAL_INITIALIZED only if all are found.
+    nnas_find_package(TensorFlowSource EXACT 2.6.0 QUIET) # All five lookups run before any result is checked.
+    nnas_find_package(TensorFlowGEMMLowpSource EXACT 2.6.0 QUIET)
+    nnas_find_package(TensorFlowEigenSource EXACT 2.6.0 QUIET)
+    nnas_find_package(TensorFlowRuySource EXACT 2.6.0 QUIET)
+    nnas_find_package(CMSISSource EXACT 5.8.0 QUIET)
+
+    if (NOT TensorFlowSource_FOUND) # Every package is mandatory; the first missing one is reported and ends processing.
+        message(STATUS "Skipping luci-interpreter: TensorFlow not found")
+        return() # Macro bodies are expanded in place, so return() acts on the scope that invoked initialize_pal.
+    endif ()
+
+    if (NOT TensorFlowGEMMLowpSource_FOUND)
+        message(STATUS "Skipping luci-interpreter: gemmlowp not found")
+        return()
+    endif ()
+
+    if (NOT TensorFlowEigenSource_FOUND)
+        message(STATUS "Skipping luci-interpreter: Eigen not found")
+        return()
+    endif ()
+
+    if (NOT TensorFlowRuySource_FOUND)
+        message(STATUS "Skipping luci-interpreter: Ruy not found")
+        return()
+    endif ()
+
+    if (NOT CMSISSource_FOUND)
+        message(STATUS "Skipping luci-interpreter: CMSISSource not found")
+        return()
+    endif ()
+
+    set(PAL_INITIALIZED TRUE) # Reached only when none of the checks above returned.
+endmacro()
+
+macro(add_pal_to_target TGT) # Adds the PAL include paths to TGT, builds the luci_interpreter_cmsisnn_pal static library and links it into TGT.
+    target_include_directories(${TGT} PRIVATE "${PAL}") # NOTE(review): PAL is not set in this file — presumably defined by the including CMakeLists; confirm.
+    target_include_directories(${TGT} PRIVATE
+            "${TensorFlowRuySource_DIR}"
+            "${TensorFlowGEMMLowpSource_DIR}"
+            "${TensorFlowEigenSource_DIR}"
+            "${TensorFlowSource_DIR}")
+    target_include_directories(${TGT} PRIVATE ${LUCI_INTERPRETER_PAL_DIR})
+
+    file(GLOB_RECURSE PAL_SOURCES "${CMSISSource_DIR}/CMSIS/NN/Source/*.c") # Every C source below CMSIS/NN/Source.
+    list(APPEND PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc
+            ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/tensor_utils.cc
+            ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc) # Plus three TFLite helper sources.
+    add_library(luci_interpreter_cmsisnn_pal STATIC ${PAL_SOURCES}) # NOTE(review): fixed target name — calling this macro for a second target would presumably try to create it again; confirm it is invoked once.
+    set_property(TARGET luci_interpreter_cmsisnn_pal PROPERTY POSITION_INDEPENDENT_CODE ON)
+    target_include_directories(luci_interpreter_cmsisnn_pal PRIVATE
+            "${TensorFlowRuySource_DIR}"
+            "${TensorFlowGEMMLowpSource_DIR}"
+            "${TensorFlowEigenSource_DIR}"
+            "${TensorFlowSource_DIR}"
+    )
+
+    add_subdirectory(${CMSISSource_DIR}/CMSIS/NN ${CMAKE_CURRENT_BINARY_DIR}/CMSISNN) # Also pulls in CMSIS-NN's own CMake project, built under CMSISNN in the current binary dir.
+    target_include_directories(luci_interpreter_cmsisnn_pal PUBLIC
+            "${CMSISSource_DIR}/CMSIS/NN/Include"
+            "${CMSISSource_DIR}/CMSIS/DSP/Include"
+            "${CMSISSource_DIR}/CMSIS/Core/Include") # PUBLIC, so TGT inherits the CMSIS include paths through the link below.
+
+    target_link_libraries(${TGT} PRIVATE luci_interpreter_cmsisnn_pal)
+endmacro()
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/KernelsToBuild.lst b/compiler/luci-micro/luci-interpreter/pal/linux/KernelsToBuild.lst
new file mode 100644
index 000000000..8e20559f9
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/KernelsToBuild.lst
@@ -0,0 +1,77 @@
+REGISTER_KERNEL(Add)
+REGISTER_KERNEL(ArgMax)
+REGISTER_KERNEL(AveragePool2D)
+REGISTER_KERNEL(BatchMatMul)
+REGISTER_KERNEL(BatchToSpaceND)
+REGISTER_KERNEL(Cast)
+REGISTER_KERNEL(Concatenation)
+REGISTER_KERNEL(Conv2D)
+REGISTER_KERNEL(DepthToSpace)
+REGISTER_KERNEL(DepthwiseConv2D)
+REGISTER_KERNEL(Dequantize)
+REGISTER_KERNEL(Div)
+REGISTER_KERNEL(Elu)
+REGISTER_KERNEL(Exp)
+REGISTER_KERNEL(ExpandDims)
+REGISTER_KERNEL(Fill)
+REGISTER_KERNEL(Floor)
+REGISTER_KERNEL(FloorDiv)
+REGISTER_KERNEL(Equal)
+REGISTER_KERNEL(FullyConnected)
+REGISTER_KERNEL(Gather)
+REGISTER_KERNEL(Greater)
+REGISTER_KERNEL(GreaterEqual)
+REGISTER_KERNEL(If)
+REGISTER_KERNEL(InstanceNorm)
+REGISTER_KERNEL(L2Normalize)
+REGISTER_KERNEL(L2Pool2D)
+REGISTER_KERNEL(LeakyRelu)
+REGISTER_KERNEL(Less)
+REGISTER_KERNEL(LessEqual)
+REGISTER_KERNEL(LocalResponseNormalization)
+REGISTER_KERNEL(LogicalAnd)
+REGISTER_KERNEL(LogicalNot)
+REGISTER_KERNEL(LogicalOr)
+REGISTER_KERNEL(Logistic)
+REGISTER_KERNEL(LogSoftmax)
+REGISTER_KERNEL(Maximum)
+REGISTER_KERNEL(MaxPool2D)
+REGISTER_KERNEL(Mean)
+REGISTER_KERNEL(Minimum)
+REGISTER_KERNEL(MirrorPad)
+REGISTER_KERNEL(Mul)
+REGISTER_KERNEL(Neg)
+REGISTER_KERNEL(NotEqual)
+REGISTER_KERNEL(OneHot)
+REGISTER_KERNEL(Pack)
+REGISTER_KERNEL(Pad)
+REGISTER_KERNEL(PadV2)
+REGISTER_KERNEL(Pow)
+REGISTER_KERNEL(PRelu)
+REGISTER_KERNEL(Quantize)
+REGISTER_KERNEL(Relu)
+REGISTER_KERNEL(Relu6)
+REGISTER_KERNEL(Reshape)
+REGISTER_KERNEL(ResizeBilinear)
+REGISTER_KERNEL(ResizeNearestNeighbor)
+REGISTER_KERNEL(ReverseV2)
+REGISTER_KERNEL(Rsqrt)
+REGISTER_KERNEL(Shape)
+REGISTER_KERNEL(Slice)
+REGISTER_KERNEL(Softmax)
+REGISTER_KERNEL(SpaceToBatchND)
+REGISTER_KERNEL(SpaceToDepth)
+REGISTER_KERNEL(Split)
+REGISTER_KERNEL(SplitV)
+REGISTER_KERNEL(StridedSlice)
+REGISTER_KERNEL(Sqrt)
+REGISTER_KERNEL(Square)
+REGISTER_KERNEL(SquaredDifference)
+REGISTER_KERNEL(Squeeze)
+REGISTER_KERNEL(Sub)
+REGISTER_KERNEL(SVDF)
+REGISTER_KERNEL(Tanh)
+REGISTER_KERNEL(Transpose)
+REGISTER_KERNEL(TransposeConv)
+REGISTER_KERNEL(Unpack)
+REGISTER_KERNEL(While)
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALArgMax.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALArgMax.h
new file mode 100644
index 000000000..21e63296d
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALArgMax.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_ARGMAX_H
+#define LUCI_INTERPRETER_PAL_ARGMAX_H
+
+#include <tensorflow/lite/kernels/internal/reference/arg_min_max.h>
+
+namespace luci_interpreter_pal
+{
+// ArgMax entry point for the linux PAL.
+//
+// The comparator is fixed to std::greater<T1> by the signature; it and all other
+// arguments are passed on unchanged to tflite::reference_ops::ArgMinMax.
+template <typename T1, typename T2, typename T3>
+static inline void ArgMinMax(const tflite::RuntimeShape &input1_shape,
+                             const T1 *input1_data,
+                             const T2 *axis,
+                             const tflite::RuntimeShape &output_shape,
+                             T3 *output_data,
+                             const std::greater<T1> cmp)
+{
+  namespace tfl_ref = tflite::reference_ops;
+  tfl_ref::ArgMinMax(input1_shape, input1_data, axis, output_shape, output_data, cmp);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_ARGMAX_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALAveragePool2d.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALAveragePool2d.h
new file mode 100644
index 000000000..cce30601f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALAveragePool2d.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H
+#define LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h>
+#include <tensorflow/lite/kernels/internal/reference/pooling.h>
+
+namespace luci_interpreter_pal
+{
+// Generic AveragePool: not implemented in this PAL.
+//
+// Any element type without an explicit specialization ends up here. The assert
+// fires when assertions are enabled; when they are compiled out the function
+// returns without writing output_data.
+template <typename T>
+static inline void AveragePool(const tflite::PoolParams &params,
+                               const tflite::RuntimeShape &input_shape, const T *input_data,
+                               const tflite::RuntimeShape &output_shape, T *output_data,
+                               const tflite::RuntimeShape &scratchpad_shape, T *scratchpad_data)
+{
+  // The arguments are only named so the signature matches the specializations.
+  static_cast<void>(params);
+  static_cast<void>(input_shape);
+  static_cast<void>(input_data);
+  static_cast<void>(output_shape);
+  static_cast<void>(output_data);
+  static_cast<void>(scratchpad_shape);
+  static_cast<void>(scratchpad_data);
+
+  // MARK: this operation is not supported at the moment
+  assert(false && "AveragePool NYI");
+}
+
+// int8 AveragePool, delegated to tflite::reference_integer_ops::AveragePool.
+//
+// The reference kernel takes no scratch buffer, so scratchpad_shape and
+// scratchpad_data are accepted only to keep the common signature and are ignored.
+template <>
+inline void AveragePool<int8_t>(const tflite::PoolParams &params,
+                                const tflite::RuntimeShape &input_shape, const int8_t *input_data,
+                                const tflite::RuntimeShape &output_shape, int8_t *output_data,
+                                const tflite::RuntimeShape &scratchpad_shape,
+                                int8_t *scratchpad_data)
+{
+  static_cast<void>(scratchpad_shape);
+  static_cast<void>(scratchpad_data);
+
+  namespace tfl_int_ref = tflite::reference_integer_ops;
+  tfl_int_ref::AveragePool(params, input_shape, input_data, output_shape, output_data);
+}
+
+// Configures the AveragePool2D scratchpad for the linux PAL.
+//
+// The scratchpad is simply marked as not allocatable; the data type and both shapes
+// have no influence on that.
+static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad,
+                                         const luci_interpreter::DataType &input_data_type,
+                                         const tflite::RuntimeShape &input_shape,
+                                         const tflite::RuntimeShape &output_shape)
+
+{
+  scratchpad->set_allocatable(false);
+
+  // Remaining parameters are intentionally unused.
+  static_cast<void>(input_data_type);
+  static_cast<void>(input_shape);
+  static_cast<void>(output_shape);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALBatchMatMul.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALBatchMatMul.h
new file mode 100644
index 000000000..3894f2d92
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALBatchMatMul.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_BATCHMATMUL_H
+#define LUCI_INTERPRETER_PAL_BATCHMATMUL_H
+
+#include <tensorflow/lite/kernels/internal/reference/batch_matmul.h>
+
+namespace luci_interpreter_pal
+{
+// Float BatchMatMul for the linux PAL.
+//
+// Both operands and the output are handed, unchanged and in the same order, to
+// tflite::reference_ops::BatchMatMul.
+inline void BatchMatMul(const tflite::RuntimeShape &lhs_shape,
+                        const float *lhs_data,
+                        const tflite::RuntimeShape &rhs_shape,
+                        const float *rhs_data,
+                        const tflite::RuntimeShape &output_shape,
+                        float *output_data)
+{
+  namespace tfl_ref = tflite::reference_ops;
+  tfl_ref::BatchMatMul(lhs_shape, lhs_data, rhs_shape, rhs_data, output_shape, output_data);
+}
+
+// Sizes the two BatchMatMul scratchpads that hold the transposed operands.
+//
+// Each scratchpad gets the shape of its operand with the last two dimensions
+// exchanged; all leading dimensions are copied as they are. The LHS scratchpad is
+// resized first, then the RHS one.
+static inline void SetupScratchpadTensor(luci_interpreter::Tensor *lhs_scratchpad,
+                                         luci_interpreter::Tensor *rhs_scratchpad,
+                                         const tflite::RuntimeShape &lhs_shape,
+                                         const tflite::RuntimeShape &rhs_shape)
+{
+  // Returns a copy of `shape` whose last two dimensions are swapped.
+  const auto transposed_shape = [](const tflite::RuntimeShape &shape) {
+    const auto rank = shape.DimensionsCount();
+    luci_interpreter::Shape swapped(rank);
+    for (int axis = 0; axis < rank - 2; ++axis)
+    {
+      swapped.dim(axis) = shape.Dims(axis);
+    }
+    swapped.dim(rank - 2) = shape.Dims(rank - 1);
+    swapped.dim(rank - 1) = shape.Dims(rank - 2);
+    return swapped;
+  };
+
+  // Scratchpad for transposed LHS
+  lhs_scratchpad->resize(transposed_shape(lhs_shape));
+
+  // Scratchpad for transposed RHS
+  rhs_scratchpad->resize(transposed_shape(rhs_shape));
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_BATCHMATMUL_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALBatchToSpaceND.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALBatchToSpaceND.h
new file mode 100644
index 000000000..3fe2022ed
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALBatchToSpaceND.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
+#define LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+BatchToSpaceND(const tflite::RuntimeShape &unextended_input1_shape, const T *input1_data,
+               const tflite::RuntimeShape &unextended_input2_shape, const int32 *block_shape_data,
+               const tflite::RuntimeShape &unextended_input3_shape, const int32 *crops_data,
+               const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{ // Linux PAL: passes every argument through to tflite::optimized_ops::BatchToSpaceND.
+  tflite::optimized_ops::BatchToSpaceND(
+    unextended_input1_shape, input1_data, unextended_input2_shape, block_shape_data,
+    unextended_input3_shape, crops_data, unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALConv2d.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALConv2d.h
new file mode 100644
index 000000000..985a15f39
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALConv2d.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_CONV2D_H
+#define LUCI_INTERPRETER_PAL_CONV2D_H
+
+#include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/conv.h>
+
+namespace luci_interpreter_pal
+{
+static inline void Conv(const tflite::ConvParams &params, const tflite::RuntimeShape &input_shape,
+                        const float *input_data, const tflite::RuntimeShape &filter_shape,
+                        const float *filter_data, const tflite::RuntimeShape &bias_shape,
+                        const float *bias_data, const tflite::RuntimeShape &output_shape,
+                        float *output_data, const tflite::RuntimeShape &scratchpad_shape,
+                        float *scratchpad_data)
+{
+  // Float convolution. scratchpad_shape is ignored; only scratchpad_data selects the path.
+  (void)scratchpad_shape;
+  // No scratchpad buffer: run the reference kernel with an empty im2col shape and no buffer.
+  if (scratchpad_data == nullptr)
+  {
+    tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data,
+                                bias_shape, bias_data, output_shape, output_data,
+                                tflite::RuntimeShape(), nullptr);
+    return;
+  }
+
+  // Scratchpad available: run the optimized kernel and pass the buffer together with the shape
+  // {batches, out_height, out_width, in_depth * filter_height * filter_width}.
+  const int32_t batch_count = tflite::MatchingDim(input_shape, 0, output_shape, 0);
+  const int32_t in_depth = tflite::MatchingDim(input_shape, 3, filter_shape, 3);
+  tflite::RuntimeShape im2col_shape{batch_count, output_shape.Dims(1), output_shape.Dims(2),
+                                    in_depth * filter_shape.Dims(1) * filter_shape.Dims(2)};
+  tflite::optimized_ops::Conv(params, input_shape, input_data, filter_shape, filter_data,
+                              bias_shape, bias_data, output_shape, output_data, im2col_shape,
+                              scratchpad_data);
+}
+
+static inline void Conv(const tflite::ConvParams &params, const tflite::RuntimeShape &input_shape,
+                        const uint8 *input_data, const tflite::RuntimeShape &filter_shape,
+                        const uint8 *filter_data, const tflite::RuntimeShape &bias_shape,
+                        const int32 *bias_data, const tflite::RuntimeShape &output_shape,
+                        uint8 *output_data, const tflite::RuntimeShape &scratchpad_shape,
+                        uint8 *scratchpad_data)
+{
+  // Uint8 Conv through the reference kernel with a gemmlowp context that is built per call.
+  // TODO Build the context once (it only costs a few microseconds); make thread count tunable.
+  const auto thread_limit = static_cast<int>(std::thread::hardware_concurrency());
+  auto context = std::make_unique<gemmlowp::GemmContext>();
+  context->set_max_num_threads(thread_limit);
+  tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data,
+                              bias_shape, bias_data, output_shape, output_data, scratchpad_shape,
+                              scratchpad_data, context.get());
+}
+
+static inline void ConvPerChannel(const tflite::ConvParams &params, const int32_t *mult,
+                                  const int32_t *shifts, const tflite::RuntimeShape &input_shape,
+                                  const int8 *input_data, const tflite::RuntimeShape &filter_shape,
+                                  const int8 *filter_data, const tflite::RuntimeShape &bias_shape,
+                                  const int32 *bias_data, const tflite::RuntimeShape &output_shape,
+                                  int8 *output_data, const tflite::RuntimeShape &scratchpad_shape,
+                                  int8 *scratchpad_data)
+{ // Per-channel int8 Conv; the scratchpad arguments are accepted but not used.
+  (void)scratchpad_shape;
+  (void)scratchpad_data;
+  // TODO enable optimized version (for now the reference integer kernel is always called)
+  tflite::reference_integer_ops::ConvPerChannel(params, mult, shifts, input_shape, input_data,
+                                                filter_shape, filter_data, bias_shape, bias_data,
+                                                output_shape, output_data);
+}
+
+static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad,
+                                         const luci_interpreter::DataType &input_data_type,
+                                         const tflite::ConvParams &params,
+                                         const tflite::RuntimeShape &input_shape,
+                                         const tflite::RuntimeShape &filter_shape,
+                                         const tflite::RuntimeShape &output_shape)
+{
+  // Decides whether Conv needs a scratchpad. If it does, `scratchpad` is resized with a Shape
+  // brace-initialized from batches * out_width * out_height * in_depth * filter_height *
+  // filter_width * getDataTypeSize(input_data_type); if not, it is marked non-allocatable.
+  const int32_t kernel_h = filter_shape.Dims(1);
+  const int32_t kernel_w = filter_shape.Dims(2);
+
+  // The checks here should be aligned with the actual implementation.
+  const bool is_dilated = params.dilation_height_factor != 1 || params.dilation_width_factor != 1;
+  const bool is_strided = params.stride_height != 1 || params.stride_width != 1;
+  const bool is_pointwise_kernel = kernel_h == 1 && kernel_w == 1;
+  const bool is_s16 = input_data_type == luci_interpreter::DataType::S16;
+
+  // S16 inputs and undilated, stride-1, 1x1-filter convolutions get no scratchpad.
+  if (is_s16 || (!is_dilated && !is_strided && is_pointwise_kernel))
+  {
+    scratchpad->set_allocatable(false);
+    return;
+  }
+
+  const int32_t batch_count = tflite::MatchingDim(input_shape, 0, output_shape, 0);
+  const int32_t in_depth = tflite::MatchingDim(input_shape, 3, filter_shape, 3);
+  const int32_t out_h = output_shape.Dims(1);
+  const int32_t out_w = output_shape.Dims(2);
+  const auto elem_bytes = static_cast<int32_t>(luci_interpreter::getDataTypeSize(input_data_type));
+  // The factors are multiplied in a fixed order and the whole product is evaluated in int32_t.
+  const int32_t total = batch_count * out_w * out_h * in_depth * kernel_h * kernel_w * elem_bytes;
+  // NOTE(review): brace-init kept on purpose; presumably yields a rank-1 shape {total} - confirm.
+  luci_interpreter::Shape required_shape{total};
+  scratchpad->resize(required_shape);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_CONV2D_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALDepthToSpace.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALDepthToSpace.h
new file mode 100644
index 000000000..f9ebfcfb5
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALDepthToSpace.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
+#define LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void DepthToSpace(const tflite::DepthToSpaceParams &op_params,
+                                const tflite::RuntimeShape &unextended_input_shape,
+                                const T *input_data,
+                                const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{ // Linux PAL: passes every argument through to tflite::optimized_ops::DepthToSpace.
+  tflite::optimized_ops::DepthToSpace(op_params, unextended_input_shape, input_data,
+                                      unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALDepthwiseConv2d.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALDepthwiseConv2d.h
new file mode 100644
index 000000000..c9d1a2948
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALDepthwiseConv2d.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H
+#define LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h>
+#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+DepthwiseConvPerChannel(const tflite::DepthwiseParams &params, const int32_t *output_multiplier,
+                        const int32_t *output_shift, const tflite::RuntimeShape &input_shape,
+                        const T *input_data, const tflite::RuntimeShape &filter_shape,
+                        const T *filter_data, const tflite::RuntimeShape &bias_shape,
+                        const int32_t *bias_data, const tflite::RuntimeShape &output_shape,
+                        T *output_data, const tflite::RuntimeShape &scratchpad_shape,
+                        T *scratchpad_data)
+{ // Generic fallback: not implemented for element type T; see the int8_t specialization.
+  {
+    // MARK: At this moment this operation is not supported
+    // NOTE: assert() is compiled out when NDEBUG is defined; the call is then a silent no-op.
+    assert(false && "DepthwiseConvPerChannel NYI");
+    (void)params;
+    (void)output_multiplier;
+    (void)output_shift;
+    (void)input_shape;
+    (void)input_data;
+    (void)filter_shape;
+    (void)filter_data;
+    (void)bias_shape;
+    (void)bias_data;
+    (void)output_shape;
+    (void)output_data;
+    (void)scratchpad_shape;
+    (void)scratchpad_data;
+  }
+}
+
+template <>
+inline void DepthwiseConvPerChannel<int8_t>(
+  const tflite::DepthwiseParams &params, const int32_t *output_multiplier,
+  const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data,
+  const tflite::RuntimeShape &filter_shape, const int8_t *filter_data,
+  const tflite::RuntimeShape &bias_shape, const int32_t *bias_data,
+  const tflite::RuntimeShape &output_shape, int8_t *output_data,
+  const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
+{ // int8_t specialization: reference integer kernel; the scratchpad arguments are unused.
+  (void)scratchpad_shape;
+  (void)scratchpad_data;
+  tflite::reference_integer_ops::DepthwiseConvPerChannel(
+    params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
+    bias_shape, bias_data, output_shape, output_data);
+}
+
+static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad,
+                                         const tflite::DepthwiseParams &params,
+                                         const luci_interpreter::DataType &input_data_type,
+                                         const tflite::RuntimeShape &input_shape,
+                                         const tflite::RuntimeShape &filter_shape,
+                                         const tflite::RuntimeShape &output_shape)
+
+{ // Marks `scratchpad` as non-allocatable unconditionally; all other arguments are ignored.
+  (void)params;
+  (void)input_data_type;
+  (void)input_shape;
+  (void)filter_shape;
+  (void)output_shape;
+
+  scratchpad->set_allocatable(false);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALDequantize.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALDequantize.h
new file mode 100644
index 000000000..3af6d0777
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALDequantize.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_DEQUANTIZE_H
+#define LUCI_INTERPRETER_PAL_DEQUANTIZE_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Dequantize(tflite::DequantizationParams &params,
+                              const tflite::RuntimeShape &input_shape, const T *input_data,
+                              const tflite::RuntimeShape &output_shape, float *output_data)
+{ // Linux PAL: passes every argument through to tflite::optimized_ops::Dequantize.
+  tflite::optimized_ops::Dequantize(params, input_shape, input_data, output_shape, output_data);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_DEQUANTIZE_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALElu.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALElu.h
new file mode 100644
index 000000000..cb365ffd0
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALElu.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_ELU_H
+#define LUCI_INTERPRETER_PAL_ELU_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+static inline void Elu(const tflite::RuntimeShape &input_shape, const float *input_data,
+                       const tflite::RuntimeShape &output_shape, float *output_data)
+{ // Float-only wrapper: passes every argument through to tflite::optimized_ops::Elu.
+  tflite::optimized_ops::Elu(input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_ELU_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALFullyConnected.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALFullyConnected.h
new file mode 100644
index 000000000..62970dbf7
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALFullyConnected.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_FULLYCONNECTED_H
+#define LUCI_INTERPRETER_PAL_FULLYCONNECTED_H
+
+#include <tensorflow/lite/kernels/internal/reference/fully_connected.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void FullyConnected(const tflite::FullyConnectedParams &params,
+                                  const tflite::RuntimeShape &input_shape, const T *input_data,
+                                  const tflite::RuntimeShape &filter_shape, const T *filter_data,
+                                  const tflite::RuntimeShape &bias_shape, const int32_t *bias_data,
+                                  const tflite::RuntimeShape &output_shape, T *output_data)
+{ // Generic fallback: not implemented for element type T; see the int8_t specialization.
+  {
+    // MARK: At this moment this operation is not supported
+    assert(false && "FullyConnected NYI"); // no-op when NDEBUG is defined
+    (void)params;
+    (void)input_shape;
+    (void)input_data;
+    (void)filter_shape;
+    (void)filter_data;
+    (void)bias_shape;
+    (void)bias_data;
+    (void)output_shape;
+    (void)output_data;
+  }
+}
+
+template <>
+inline void
+FullyConnected<int8_t>(const tflite::FullyConnectedParams &params,
+                       const tflite::RuntimeShape &input_shape, const int8_t *input_data,
+                       const tflite::RuntimeShape &filter_shape, const int8_t *filter_data,
+                       const tflite::RuntimeShape &bias_shape, const int32_t *bias_data,
+                       const tflite::RuntimeShape &output_shape, int8_t *output_data)
+{ // int8_t specialization: forwards to tflite::reference_integer_ops::FullyConnected.
+  tflite::reference_integer_ops::FullyConnected(params, input_shape, input_data, filter_shape,
+                                                filter_data, bias_shape, bias_data, output_shape,
+                                                output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_FULLYCONNECTED_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALGather.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALGather.h
new file mode 100644
index 000000000..49ac35f93
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALGather.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_GATHER_H
+#define LUCI_INTERPRETER_PAL_GATHER_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T, typename CoordsT = int32>
+static inline void Gather(const tflite::GatherParams &op_params,
+                          const tflite::RuntimeShape &input_shape, const T *input_data,
+                          const tflite::RuntimeShape &coords_shape, const CoordsT *coords_data,
+                          const tflite::RuntimeShape &output_shape, T *output_data)
+{ // Linux PAL: passes every argument through to tflite::optimized_ops::Gather.
+  tflite::optimized_ops::Gather(op_params, input_shape, input_data, coords_shape, coords_data,
+                                output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_GATHER_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALL2Normalize.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALL2Normalize.h
new file mode 100644
index 000000000..6c663e21f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALL2Normalize.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_L2NORMALIZE_H
+#define LUCI_INTERPRETER_PAL_L2NORMALIZE_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void L2Normalization(const tflite::L2NormalizationParams &op_params,
+                                   const tflite::RuntimeShape &input_shape, const T *input_data,
+                                   const tflite::RuntimeShape &output_shape, T *output_data)
+{ // Linux PAL: passes every argument through to tflite::optimized_ops::L2Normalization.
+  tflite::optimized_ops::L2Normalization(op_params, input_shape, input_data, output_shape,
+                                         output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_L2NORMALIZE_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALL2Pool2D.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALL2Pool2D.h
new file mode 100644
index 000000000..aac57f2b2
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALL2Pool2D.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_L2POOL2D_H
+#define LUCI_INTERPRETER_PAL_L2POOL2D_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void L2Pool(const tflite::PoolParams &params, const tflite::RuntimeShape &input_shape,
+                          const T *input_data, const tflite::RuntimeShape &output_shape,
+                          T *output_data)
+{ // Linux PAL: passes every argument through to tflite::optimized_ops::L2Pool.
+  tflite::optimized_ops::L2Pool(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_L2POOL2D_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALLeakyRelu.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALLeakyRelu.h
new file mode 100644
index 000000000..e8209bae6
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALLeakyRelu.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_LEAKYRELU_H
+#define LUCI_INTERPRETER_PAL_LEAKYRELU_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+static inline void LeakyRelu(const tflite::LeakyReluParams &params,
+                             const tflite::RuntimeShape &input_shape, const float *input_data,
+                             const tflite::RuntimeShape &output_shape, float *output_data)
+{ // Float-only wrapper: passes every argument through to tflite::optimized_ops::LeakyRelu.
+  tflite::optimized_ops::LeakyRelu(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_LEAKYRELU_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALLocalResponseNormalization.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALLocalResponseNormalization.h
new file mode 100644
index 000000000..54f7f0916
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALLocalResponseNormalization.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_LOCALRESPONSENORMALIZATION_H
+#define LUCI_INTERPRETER_PAL_LOCALRESPONSENORMALIZATION_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+static inline void
+LocalResponseNormalization(const tflite::LocalResponseNormalizationParams &op_params,
+                           const tflite::RuntimeShape &input_shape, const float *input_data,
+                           const tflite::RuntimeShape &output_shape, float *output_data)
+{ // Float-only wrapper around tflite::optimized_ops::LocalResponseNormalization.
+  tflite::optimized_ops::LocalResponseNormalization(op_params, input_shape, input_data,
+                                                    output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_LOCALRESPONSENORMALIZATION_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALLogSoftmax.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALLogSoftmax.h
new file mode 100644
index 000000000..a32e3eec6
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALLogSoftmax.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_LOGSOFTMAX_H
+#define LUCI_INTERPRETER_PAL_LOGSOFTMAX_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+static inline void PopulateSoftmaxLookupTable(tflite::SoftmaxParams *data, float input_scale,
+                                              float beta)
+{ // Delegates to tflite::optimized_ops::PopulateSoftmaxLookupTable with the same arguments.
+  tflite::optimized_ops::PopulateSoftmaxLookupTable(data, input_scale, beta);
+}
+
+static inline void InitializeParams(tflite::SoftmaxParams *params, float input_scale, float beta)
+{
+  // Intentionally a no-op on Linux; the arguments are only cast to void to mark them unused.
+  (void)params;
+  (void)input_scale;
+  (void)beta;
+}
+
+static inline void LogSoftmax(const tflite::SoftmaxParams &params, float input_scale,
+                              const tflite::RuntimeShape &input_shape, const uint8 *input_data,
+                              const tflite::RuntimeShape &output_shape, uint8 *output_data)
+{ // Uint8 wrapper: passes every argument through to tflite::optimized_ops::LogSoftmax.
+  tflite::optimized_ops::LogSoftmax(params, input_scale, input_shape, input_data, output_shape,
+                                    output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_LOGSOFTMAX_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALMul.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALMul.h
new file mode 100644
index 000000000..a8a9d4abc
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALMul.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_MUL_H
+#define LUCI_INTERPRETER_PAL_MUL_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+// Linux PAL wrapper for Mul: passes all arguments through unchanged to
+// tflite::optimized_ops::Mul.
+template <typename T>
+static inline void Mul(tflite::ArithmeticParams &params, const tflite::RuntimeShape &input1_shape,
+                       const T *input1_data, const tflite::RuntimeShape &input2_shape,
+                       const T *input2_data, const tflite::RuntimeShape &output_shape,
+                       T *output_data)
+{
+  tflite::optimized_ops::Mul(params, input1_shape, input1_data, input2_shape, input2_data,
+                             output_shape, output_data);
+}
+
+// Explicit specialization for int64_t: calls tflite::optimized_ops::BroadcastMul4DSlow
+// instead of tflite::optimized_ops::Mul.
+// NOTE(review): presumably optimized_ops::Mul has no int64_t overload - confirm.
+template <>
+inline void Mul(tflite::ArithmeticParams &params, const tflite::RuntimeShape &input1_shape,
+                const int64_t *input1_data, const tflite::RuntimeShape &input2_shape,
+                const int64_t *input2_data, const tflite::RuntimeShape &output_shape,
+                int64_t *output_data)
+{
+  tflite::optimized_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape,
+                                            input2_data, output_shape, output_data);
+}
+
+// Linux PAL wrapper for broadcasting Mul: passes all arguments through unchanged to
+// tflite::optimized_ops::BroadcastMul4DSlow.
+template <typename T>
+static inline void
+BroadcastMul4DSlow(tflite::ArithmeticParams &params, const tflite::RuntimeShape &input1_shape,
+                   const T *input1_data, const tflite::RuntimeShape &input2_shape,
+                   const T *input2_data, const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  tflite::optimized_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape,
+                                            input2_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_MUL_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALNeg.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALNeg.h
new file mode 100644
index 000000000..797ffee1b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALNeg.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_NEG_H
+#define LUCI_INTERPRETER_PAL_NEG_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+// Linux PAL wrapper for Neg: passes all arguments through unchanged to
+// tflite::reference_ops::Negate (the reference_ops kernel, not an optimized_ops one).
+template <typename T>
+static inline void Negate(const tflite::RuntimeShape &input_shape, const T *input_data,
+                          const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  tflite::reference_ops::Negate(input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_NEG_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALQuantize.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALQuantize.h
new file mode 100644
index 000000000..bf1d7954e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALQuantize.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_QUANTIZE_H
+#define LUCI_INTERPRETER_PAL_QUANTIZE_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+// Linux PAL wrapper for Quantize (float input, output element type T): passes all arguments
+// through unchanged to tflite::optimized_ops::AffineQuantize.
+template <typename T>
+static inline void Quantize(tflite::QuantizationParams &params,
+                            const tflite::RuntimeShape &input_shape, const float *input_data,
+                            const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  tflite::optimized_ops::AffineQuantize(params, input_shape, input_data, output_shape, output_data);
+}
+
+// Linux PAL wrapper for Requantize: passes all arguments through unchanged to
+// tflite::optimized_ops::Requantize.
+template <typename Input, typename Output>
+static inline void Requantize(const Input *input_data, int32_t size,
+                              int32_t effective_scale_multiplier, int32_t effective_scale_shift,
+                              int32_t input_zero_point, int32_t output_zero_point,
+                              Output *output_data)
+{
+  tflite::optimized_ops::Requantize(input_data, size, effective_scale_multiplier,
+                                    effective_scale_shift, input_zero_point, output_zero_point,
+                                    output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_QUANTIZE_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALRelu.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALRelu.h
new file mode 100644
index 000000000..b4c715d3e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALRelu.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RELU_H
+#define LUCI_INTERPRETER_PAL_RELU_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+// Linux PAL wrapper for float Relu: passes all arguments through unchanged to
+// tflite::optimized_ops::Relu.
+static inline void Relu(const tflite::RuntimeShape &input_shape, const float *input_data,
+                        const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::optimized_ops::Relu(input_shape, input_data, output_shape, output_data);
+}
+
+// Linux PAL wrapper for ReluX: passes all arguments through unchanged to
+// tflite::optimized_ops::ReluX.
+// NOTE(review): PALRelu6.h defines the same ReluX template in this namespace under a
+// different include guard, so including both headers in one translation unit would redefine it.
+template <typename T>
+static inline void ReluX(const tflite::ReluParams &params, const tflite::RuntimeShape &input_shape,
+                         const T *input_data, const tflite::RuntimeShape &output_shape,
+                         T *output_data)
+{
+  tflite::optimized_ops::ReluX(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RELU_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALRelu6.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALRelu6.h
new file mode 100644
index 000000000..bf2f91aa5
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALRelu6.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RELU6_H
+#define LUCI_INTERPRETER_PAL_RELU6_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+// Linux PAL wrapper for float Relu6: passes all arguments through unchanged to
+// tflite::optimized_ops::Relu6.
+static inline void Relu6(const tflite::RuntimeShape &input_shape, const float *input_data,
+                         const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::optimized_ops::Relu6(input_shape, input_data, output_shape, output_data);
+}
+
+// Linux PAL wrapper for ReluX: passes all arguments through unchanged to
+// tflite::optimized_ops::ReluX.
+// NOTE(review): PALRelu.h defines the same ReluX template in this namespace under a
+// different include guard, so including both headers in one translation unit would redefine it.
+template <typename T>
+static inline void ReluX(const tflite::ReluParams &params, const tflite::RuntimeShape &input_shape,
+                         const T *input_data, const tflite::RuntimeShape &output_shape,
+                         T *output_data)
+{
+  tflite::optimized_ops::ReluX(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RELU6_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALResizeBilinear.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALResizeBilinear.h
new file mode 100644
index 000000000..7380081dc
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALResizeBilinear.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
+#define LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
+
+#include <tensorflow/lite/kernels/internal/optimized/resize_bilinear.h>
+
+namespace luci_interpreter_pal
+{
+// Linux PAL wrapper for ResizeBilinear: passes all arguments through unchanged to
+// tflite::optimized_ops::ResizeBilinear.
+template <typename T>
+static inline void
+ResizeBilinear(const tflite::ResizeBilinearParams &op_params,
+               const tflite::RuntimeShape &unextended_input_shape, const T *input_data,
+               const tflite::RuntimeShape &output_size_shape, const int32 *output_size_data,
+               const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::optimized_ops::ResizeBilinear(op_params, unextended_input_shape, input_data,
+                                        output_size_shape, output_size_data,
+                                        unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALResizeNearestNeighbor.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALResizeNearestNeighbor.h
new file mode 100644
index 000000000..74d19265b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALResizeNearestNeighbor.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
+#define LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+// Linux PAL wrapper for ResizeNearestNeighbor: passes all arguments through unchanged to
+// tflite::optimized_ops::ResizeNearestNeighbor.
+template <typename T>
+static inline void
+ResizeNearestNeighbor(const tflite::ResizeNearestNeighborParams &op_params,
+                      const tflite::RuntimeShape &unextended_input_shape, const T *input_data,
+                      const tflite::RuntimeShape &output_size_shape, const int32 *output_size_data,
+                      const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::optimized_ops::ResizeNearestNeighbor(op_params, unextended_input_shape, input_data,
+                                               output_size_shape, output_size_data,
+                                               unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALSVDF.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALSVDF.h
new file mode 100644
index 000000000..0ffba14f0
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALSVDF.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SVDF_H
+#define LUCI_INTERPRETER_PAL_SVDF_H
+
+#include <tensorflow/lite/kernels/internal/reference/svdf.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter_pal
+{
+// Linux PAL wrapper for the integer SVDF path: forwards to
+// tflite::reference_ops::EvalIntegerSVDF, passing `params` by address.
+static inline void
+IntegerSVDF(const TfLiteSVDFParams &params, const tflite::RuntimeShape &input_shape,
+            const int8_t *input_data, const tflite::RuntimeShape &weight_feature_shape,
+            const int8_t *weight_feature_data, const tflite::RuntimeShape &weight_time_shape,
+            const int16_t *weight_time_data, const tflite::RuntimeShape &bias_shape,
+            const int32_t *bias_data, int16_t *activation_state_data,
+            const tflite::RuntimeShape &output_shape, int8_t *output_data, int32_t *scratchpad_data,
+            int32_t *output_temp_data, int32_t scale_1_a, int scale_1_b, int32_t scale_2_a,
+            int scale_2_b, int32_t input_zp, int32_t output_zp)
+{
+  tflite::reference_ops::EvalIntegerSVDF(&params, input_shape, input_data, weight_feature_shape,
+                                         weight_feature_data, weight_time_shape, weight_time_data,
+                                         bias_shape, bias_data, activation_state_data, output_shape,
+                                         output_data, scratchpad_data, output_temp_data, scale_1_a,
+                                         scale_1_b, scale_2_a, scale_2_b, input_zp, output_zp);
+}
+
+// Linux PAL wrapper for the float SVDF path: forwards to
+// tflite::reference_ops::EvalFloatSVDF, passing `params` by address.
+static inline void
+FloatSVDF(const TfLiteSVDFParams &params, const tflite::RuntimeShape &input_shape,
+          const float *input_data, const tflite::RuntimeShape &weight_feature_shape,
+          const float *weight_feature_data, const tflite::RuntimeShape &weight_time_shape,
+          const float *weight_time_data, const tflite::RuntimeShape &bias_shape,
+          const float *bias_data, float *scratchpad_data, float *activation_state_data,
+          const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::reference_ops::EvalFloatSVDF(&params, input_shape, input_data, weight_feature_shape,
+                                       weight_feature_data, weight_time_shape, weight_time_data,
+                                       bias_shape, bias_data, scratchpad_data,
+                                       activation_state_data, output_shape, output_data);
+}
+
+// Sizes the SVDF scratchpad tensors for the linux platform.
+//  - FLOAT32 input with S8/U8 feature weights (hybrid) is rejected with std::runtime_error.
+//  - scratchpad_1 is always resized to {batch_size, num_filters}.
+//  - scratchpad_2 is resized to {batch_size, num_units} only when the input type is S8.
+// scratchpad_3..6, input_shape and weight_time_shape are not used on this platform.
+// NOTE: this header does not include the luci_interpreter / loco declarations it uses; the
+// including translation unit must provide them before including this file.
+static inline void SetupScratchpadTensor(
+  const luci_interpreter::DataType &input_data_type,
+  const luci_interpreter::DataType &weight_feature_data_type,
+  luci_interpreter::Tensor *scratchpad_1, luci_interpreter::Tensor *scratchpad_2,
+  luci_interpreter::Tensor *scratchpad_3, luci_interpreter::Tensor *scratchpad_4,
+  luci_interpreter::Tensor *scratchpad_5, luci_interpreter::Tensor *scratchpad_6,
+  const luci_interpreter::Shape input_shape, const luci_interpreter::Shape weight_time_shape,
+  const int32_t batch_size, const int32_t num_filters, const int32_t num_units)
+{
+
+  if (input_data_type == loco::DataType::FLOAT32 &&
+      (weight_feature_data_type == loco::DataType::S8 ||
+       weight_feature_data_type == loco::DataType::U8))
+  {
+    (void)input_shape;
+    (void)weight_time_shape;
+    (void)scratchpad_3;
+    (void)scratchpad_4;
+    (void)scratchpad_5;
+    (void)scratchpad_6;
+
+    throw std::runtime_error("Hybrid type is not currently supported for linux platform");
+  }
+
+  // Resize scratchpad_1 tensor
+  scratchpad_1->resize({batch_size, num_filters});
+
+  if (input_data_type == loco::DataType::S8)
+  {
+    // Resize scratchpad_2 for full_integer op
+    scratchpad_2->resize({batch_size, num_units});
+  }
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SVDF_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALSlice.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALSlice.h
new file mode 100644
index 000000000..640a71684
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALSlice.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SLICE_H
+#define LUCI_INTERPRETER_PAL_SLICE_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+// Linux PAL wrapper for Slice: passes all arguments through unchanged to
+// tflite::optimized_ops::Slice.
+template <typename T>
+static inline void Slice(const tflite::SliceParams &op_params,
+                         const tflite::RuntimeShape &input_shape, const T *input_data,
+                         const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  tflite::optimized_ops::Slice(op_params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SLICE_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALSoftmax.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALSoftmax.h
new file mode 100644
index 000000000..b197e79d1
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALSoftmax.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SOFTMAX_H
+#define LUCI_INTERPRETER_PAL_SOFTMAX_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+// Linux PAL wrapper: passes all arguments through unchanged to
+// tflite::optimized_ops::PopulateSoftmaxLookupTable.
+static inline void PopulateSoftmaxLookupTable(tflite::SoftmaxParams *data, float input_scale,
+                                              float beta)
+{
+  tflite::optimized_ops::PopulateSoftmaxLookupTable(data, input_scale, beta);
+}
+
+// No-op on linux: all three arguments are deliberately ignored.
+static inline void InitializeParams(tflite::SoftmaxParams *params, float input_scale, float beta)
+{
+  // Do nothing for linux
+  (void)params;
+  (void)input_scale;
+  (void)beta;
+}
+
+// Linux PAL wrapper for Softmax: passes all arguments through unchanged to
+// tflite::optimized_ops::Softmax.
+template <typename In, typename Out>
+static inline void Softmax(const tflite::SoftmaxParams &params,
+                           const tflite::RuntimeShape &input_shape, const In *input_data,
+                           const tflite::RuntimeShape &output_shape, Out *output_data)
+{
+  tflite::optimized_ops::Softmax(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SOFTMAX_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALSpaceToBatchND.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALSpaceToBatchND.h
new file mode 100644
index 000000000..5e8de9ba3
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALSpaceToBatchND.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
+#define LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+// Linux PAL wrapper for SpaceToBatchND: passes all arguments through unchanged to
+// tflite::optimized_ops::SpaceToBatchND.
+template <typename T>
+static inline void
+SpaceToBatchND(const tflite::SpaceToBatchParams &params,
+               const tflite::RuntimeShape &unextended_input1_shape, const T *input1_data,
+               const tflite::RuntimeShape &unextended_input2_shape, const int32 *block_shape_data,
+               const tflite::RuntimeShape &unextended_input3_shape, const int32 *paddings_data,
+               const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::optimized_ops::SpaceToBatchND(
+    params, unextended_input1_shape, input1_data, unextended_input2_shape, block_shape_data,
+    unextended_input3_shape, paddings_data, unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALSpaceToDepth.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALSpaceToDepth.h
new file mode 100644
index 000000000..52d2a5bb1
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALSpaceToDepth.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SPACETODEPTH_H
+#define LUCI_INTERPRETER_PAL_SPACETODEPTH_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+// Linux PAL wrapper for SpaceToDepth: passes all arguments through unchanged to
+// tflite::optimized_ops::SpaceToDepth.
+template <typename T>
+static inline void SpaceToDepth(const tflite::SpaceToDepthParams &op_params,
+                                const tflite::RuntimeShape &unextended_input_shape,
+                                const T *input_data,
+                                const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::optimized_ops::SpaceToDepth(op_params, unextended_input_shape, input_data,
+                                      unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SPACETODEPTH_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALSplit.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALSplit.h
new file mode 100644
index 000000000..4d8da72d8
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALSplit.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SPLIT_H
+#define LUCI_INTERPRETER_PAL_SPLIT_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+// Linux PAL wrapper for Split: passes all arguments through unchanged to
+// tflite::optimized_ops::Split.
+template <typename Scalar>
+static inline void Split(const tflite::SplitParams &params, const tflite::RuntimeShape &input_shape,
+                         const Scalar *input_data, const tflite::RuntimeShape *const *output_shapes,
+                         Scalar *const *output_data)
+{
+  tflite::optimized_ops::Split(params, input_shape, input_data, output_shapes, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SPLIT_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/PALSub.h b/compiler/luci-micro/luci-interpreter/pal/linux/PALSub.h
new file mode 100644
index 000000000..04080d619
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/PALSub.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SUB_H
+#define LUCI_INTERPRETER_PAL_SUB_H
+
+#include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+// Linux PAL wrapper for Sub: passes all arguments through unchanged to
+// tflite::optimized_ops::Sub.
+template <typename T>
+static inline void Sub(const tflite::ArithmeticParams &params,
+                       const tflite::RuntimeShape &input1_shape, const T *input1_data,
+                       const tflite::RuntimeShape &input2_shape, const T *input2_data,
+                       const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  tflite::optimized_ops::Sub(params, input1_shape, input1_data, input2_shape, input2_data,
+                             output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SUB_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/linux/pal.cmake b/compiler/luci-micro/luci-interpreter/pal/linux/pal.cmake
new file mode 100644
index 000000000..185700cf9
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/linux/pal.cmake
@@ -0,0 +1,92 @@
+# Locates the TensorFlow 2.6.0 source packages (TensorFlow, gemmlowp, Eigen, Ruy) and Threads
+# needed by the linux PAL, and sets PAL_INITIALIZED to TRUE when all of them are found.
+# NOTE: this is a macro, so each return() below returns from the caller's scope and
+# PAL_INITIALIZED is then not set here.
+macro(initialize_pal)
+    nnas_find_package(TensorFlowSource EXACT 2.6.0 QUIET)
+    nnas_find_package(TensorFlowGEMMLowpSource EXACT 2.6.0 QUIET)
+    nnas_find_package(TensorFlowEigenSource EXACT 2.6.0 QUIET)
+    nnas_find_package(TensorFlowRuySource EXACT 2.6.0 QUIET)
+
+    if (NOT TensorFlowSource_FOUND)
+        message(STATUS "Skipping luci-interpreter: TensorFlow not found")
+        return()
+    endif ()
+
+    if (NOT TensorFlowGEMMLowpSource_FOUND)
+        message(STATUS "Skipping luci-interpreter: gemmlowp not found")
+        return()
+    endif ()
+
+    if (NOT TensorFlowEigenSource_FOUND)
+        message(STATUS "Skipping luci-interpreter: Eigen not found")
+        return()
+    endif ()
+
+    if (NOT TensorFlowRuySource_FOUND)
+        message(STATUS "Skipping luci-interpreter: Ruy not found")
+        return()
+    endif ()
+
+    find_package(Threads REQUIRED)
+
+    set(PAL_INITIALIZED TRUE)
+endmacro()
+
+# Adds the PAL include directories to target TGT and links it against Threads and the
+# luci_interpreter_linux_pal static library built from selected TensorFlow Lite sources.
+macro(add_pal_to_target TGT)
+    target_include_directories(${TGT} PRIVATE "${PAL}")
+    target_include_directories(${TGT} SYSTEM PRIVATE
+            "${TensorFlowRuySource_DIR}"
+            "${TensorFlowGEMMLowpSource_DIR}"
+            "${TensorFlowEigenSource_DIR}"
+            "${TensorFlowSource_DIR}")
+    target_include_directories(${TGT} PRIVATE ${LUCI_INTERPRETER_PAL_DIR})
+
+    # TODO put it back, I changed my mind.
+    # instead add sources with visitors in this library
+    set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/tensor_utils.cc
+            ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
+            ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc)
+
+    if(BUILD_ARM32_NEON)
+        # NOTE may need to revise this list for version upgrade
+        set(PAL_SOURCES ${PAL_SOURCES}
+                ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
+                ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/optimized/cpu_check.cc
+                ${TensorFlowRuySource_DIR}/ruy/allocator.cc
+                ${TensorFlowRuySource_DIR}/ruy/block_map.cc
+                ${TensorFlowRuySource_DIR}/ruy/blocking_counter.cc
+                ${TensorFlowRuySource_DIR}/ruy/context_get_ctx.cc
+                ${TensorFlowRuySource_DIR}/ruy/cpuinfo.cc
+                ${TensorFlowRuySource_DIR}/ruy/ctx.cc
+                ${TensorFlowRuySource_DIR}/ruy/denormal.cc
+                ${TensorFlowRuySource_DIR}/ruy/frontend.cc
+                ${TensorFlowRuySource_DIR}/ruy/pack_arm.cc
+                ${TensorFlowRuySource_DIR}/ruy/prepacked_cache.cc
+                ${TensorFlowRuySource_DIR}/ruy/prepare_packed_matrices.cc
+                ${TensorFlowRuySource_DIR}/ruy/system_aligned_alloc.cc
+                ${TensorFlowRuySource_DIR}/ruy/thread_pool.cc
+                ${TensorFlowRuySource_DIR}/ruy/trmul.cc
+                ${TensorFlowRuySource_DIR}/ruy/tune.cc
+                ${TensorFlowRuySource_DIR}/ruy/wait.cc
+                ${TensorFlowRuySource_DIR}/ruy/kernel_arm32.cc
+                )
+    endif(BUILD_ARM32_NEON)
+
+    # Create the helper library only once so that this macro can be applied to several targets
+    # without a duplicate-target error from add_library().
+    if(NOT TARGET luci_interpreter_linux_pal)
+        add_library(luci_interpreter_linux_pal STATIC ${PAL_SOURCES})
+        set_target_properties(luci_interpreter_linux_pal PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        target_include_directories(luci_interpreter_linux_pal SYSTEM PRIVATE
+                "${TensorFlowRuySource_DIR}"
+                "${TensorFlowGEMMLowpSource_DIR}"
+                "${TensorFlowEigenSource_DIR}"
+                "${TensorFlowSource_DIR}"
+        )
+    endif()
+
+    target_link_libraries(${TGT} PRIVATE Threads::Threads luci_interpreter_linux_pal)
+endmacro()
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/KernelsToBuild.lst b/compiler/luci-micro/luci-interpreter/pal/mcu/KernelsToBuild.lst
new file mode 100644
index 000000000..f0df58db3
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/KernelsToBuild.lst
@@ -0,0 +1,62 @@
+REGISTER_KERNEL(Add)
+REGISTER_KERNEL(ArgMax)
+REGISTER_KERNEL(AveragePool2D)
+REGISTER_KERNEL(BatchToSpaceND)
+REGISTER_KERNEL(Cast)
+REGISTER_KERNEL(Concatenation)
+REGISTER_KERNEL(Conv2D)
+REGISTER_KERNEL(DepthToSpace)
+REGISTER_KERNEL(DepthwiseConv2D)
+REGISTER_KERNEL(Dequantize)
+REGISTER_KERNEL(Div)
+REGISTER_KERNEL(Elu)
+REGISTER_KERNEL(Exp)
+REGISTER_KERNEL(ExpandDims)
+REGISTER_KERNEL(Fill)
+REGISTER_KERNEL(Floor)
+REGISTER_KERNEL(FloorDiv)
+REGISTER_KERNEL(Equal)
+REGISTER_KERNEL(FullyConnected)
+REGISTER_KERNEL(Greater)
+REGISTER_KERNEL(GreaterEqual)
+REGISTER_KERNEL(If)
+REGISTER_KERNEL(InstanceNorm)
+REGISTER_KERNEL(L2Normalize)
+REGISTER_KERNEL(L2Pool2D)
+REGISTER_KERNEL(LeakyRelu)
+REGISTER_KERNEL(Less)
+REGISTER_KERNEL(LessEqual)
+REGISTER_KERNEL(LogicalAnd)
+REGISTER_KERNEL(LogicalNot)
+REGISTER_KERNEL(LogicalOr)
+REGISTER_KERNEL(Logistic)
+REGISTER_KERNEL(Maximum)
+REGISTER_KERNEL(MaxPool2D)
+REGISTER_KERNEL(Minimum)
+REGISTER_KERNEL(MirrorPad)
+REGISTER_KERNEL(Mul)
+REGISTER_KERNEL(Neg)
+REGISTER_KERNEL(NotEqual)
+REGISTER_KERNEL(Pad)
+REGISTER_KERNEL(PadV2)
+REGISTER_KERNEL(PRelu)
+REGISTER_KERNEL(Quantize)
+REGISTER_KERNEL(Reshape)
+REGISTER_KERNEL(ResizeBilinear)
+REGISTER_KERNEL(ResizeNearestNeighbor)
+REGISTER_KERNEL(Rsqrt)
+REGISTER_KERNEL(Shape)
+REGISTER_KERNEL(Softmax)
+REGISTER_KERNEL(SpaceToBatchND)
+REGISTER_KERNEL(SpaceToDepth)
+REGISTER_KERNEL(StridedSlice)
+REGISTER_KERNEL(Sqrt)
+REGISTER_KERNEL(Square)
+REGISTER_KERNEL(SquaredDifference)
+REGISTER_KERNEL(Squeeze)
+REGISTER_KERNEL(Sub)
+REGISTER_KERNEL(SVDF)
+REGISTER_KERNEL(Tanh)
+REGISTER_KERNEL(Transpose)
+REGISTER_KERNEL(TransposeConv)
+REGISTER_KERNEL(While)
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALArgMax.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALArgMax.h
new file mode 100644
index 000000000..21e63296d
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALArgMax.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_ARGMAX_H
+#define LUCI_INTERPRETER_PAL_ARGMAX_H
+
+#include <tensorflow/lite/kernels/internal/reference/arg_min_max.h>
+
+#include <functional>
+
+namespace luci_interpreter_pal
+{
+/**
+ * @brief MCU PAL entry point for ArgMax: forwards to the TFLite reference ArgMinMax kernel.
+ *
+ * The comparator parameter is fixed to std::greater<T1>, so this overload only accepts the
+ * "greater than" ordering; all arguments are passed through to the reference kernel unchanged.
+ */
+template <typename T1, typename T2, typename T3>
+static inline void ArgMinMax(const tflite::RuntimeShape &input1_shape, const T1 *input1_data,
+                             const T2 *axis, const tflite::RuntimeShape &output_shape,
+                             T3 *output_data, const std::greater<T1> cmp)
+{
+  tflite::reference_ops::ArgMinMax(input1_shape, input1_data, axis, output_shape, output_data, cmp);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_ARGMAX_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALAveragePool2d.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALAveragePool2d.h
new file mode 100644
index 000000000..cce30601f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALAveragePool2d.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H
+#define LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h>
+#include <tensorflow/lite/kernels/internal/reference/pooling.h>
+
+#include <cassert>
+
+namespace luci_interpreter_pal
+{
+/**
+ * @brief Generic AveragePool entry point for element types that have no kernel here.
+ *
+ * Only the int8_t specialization in this header performs a computation. This primary template
+ * fails an assertion; when assertions are compiled out it returns without writing any output.
+ */
+template <typename T>
+static inline void AveragePool(const tflite::PoolParams &params,
+                               const tflite::RuntimeShape &input_shape, const T *input_data,
+                               const tflite::RuntimeShape &output_shape, T *output_data,
+                               const tflite::RuntimeShape &scratchpad_shape, T *scratchpad_data)
+{
+  {
+    // MARK: At this moment this operation is not supported
+    assert(false && "AveragePool NYI");
+    (void)params;
+    (void)input_shape;
+    (void)input_data;
+    (void)output_shape;
+    (void)output_data;
+    (void)scratchpad_shape;
+    (void)scratchpad_data;
+  }
+}
+
+/**
+ * @brief int8_t AveragePool: forwards to the TFLite reference integer kernel.
+ *
+ * The scratchpad arguments are accepted but not used.
+ */
+template <>
+inline void AveragePool<int8_t>(const tflite::PoolParams &params,
+                                const tflite::RuntimeShape &input_shape, const int8_t *input_data,
+                                const tflite::RuntimeShape &output_shape, int8_t *output_data,
+                                const tflite::RuntimeShape &scratchpad_shape,
+                                int8_t *scratchpad_data)
+{
+  (void)scratchpad_shape;
+  (void)scratchpad_data;
+
+  tflite::reference_integer_ops::AveragePool(params, input_shape, input_data, output_shape,
+                                             output_data);
+}
+
+/**
+ * @brief Marks the AveragePool scratchpad tensor as non-allocatable.
+ *
+ * The data type and shape arguments are ignored by this implementation.
+ */
+static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad,
+                                         const luci_interpreter::DataType &input_data_type,
+                                         const tflite::RuntimeShape &input_shape,
+                                         const tflite::RuntimeShape &output_shape)
+{
+  (void)input_data_type;
+  (void)input_shape;
+  (void)output_shape;
+
+  scratchpad->set_allocatable(false);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALBatchToSpaceND.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALBatchToSpaceND.h
new file mode 100644
index 000000000..4dd77ffdc
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALBatchToSpaceND.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
+#define LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
+
+#include <tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h>
+
+namespace luci_interpreter_pal
+{
+/**
+ * @brief BatchToSpaceND entry point: forwards every argument unchanged to the TFLite
+ *        reference kernel.
+ */
+template <typename T>
+static inline void
+BatchToSpaceND(const tflite::RuntimeShape &unextended_input1_shape, const T *input1_data,
+               const tflite::RuntimeShape &unextended_input2_shape, const int32 *block_shape_data,
+               const tflite::RuntimeShape &unextended_input3_shape, const int32 *crops_data,
+               const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::reference_ops::BatchToSpaceND(
+    unextended_input1_shape, input1_data, unextended_input2_shape, block_shape_data,
+    unextended_input3_shape, crops_data, unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALConv2d.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALConv2d.h
new file mode 100644
index 000000000..13976877a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALConv2d.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_CONV2D_H
+#define LUCI_INTERPRETER_PAL_CONV2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/conv.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/conv.h>
+
+namespace luci_interpreter_pal
+{
+/**
+ * @brief Float convolution: forwards to the TFLite reference kernel.
+ *
+ * The scratchpad arguments are not used; an empty shape and a null pointer are handed to the
+ * reference kernel in their place.
+ */
+static inline void Conv(const tflite::ConvParams &params, const tflite::RuntimeShape &input_shape,
+                        const float *input_data, const tflite::RuntimeShape &filter_shape,
+                        const float *filter_data, const tflite::RuntimeShape &bias_shape,
+                        const float *bias_data, const tflite::RuntimeShape &output_shape,
+                        float *output_data, const tflite::RuntimeShape &scratchpad_shape,
+                        float *scratchpad_data)
+{
+  (void)scratchpad_data;
+  (void)scratchpad_shape;
+  tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data,
+                              bias_shape, bias_data, output_shape, output_data,
+                              tflite::RuntimeShape(), nullptr);
+}
+
+/**
+ * @brief uint8 convolution: forwards to the TFLite reference kernel.
+ *
+ * Unlike the float overload, the scratchpad shape and buffer are passed through to the
+ * reference kernel, followed by a null pointer as its final argument.
+ */
+static inline void Conv(const tflite::ConvParams &params, const tflite::RuntimeShape &input_shape,
+                        const uint8 *input_data, const tflite::RuntimeShape &filter_shape,
+                        const uint8 *filter_data, const tflite::RuntimeShape &bias_shape,
+                        const int32 *bias_data, const tflite::RuntimeShape &output_shape,
+                        uint8 *output_data, const tflite::RuntimeShape &scratchpad_shape,
+                        uint8 *scratchpad_data)
+{
+  tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data,
+                              bias_shape, bias_data, output_shape, output_data, scratchpad_shape,
+                              scratchpad_data, nullptr);
+}
+
+/**
+ * @brief int8 per-channel convolution: forwards to the TFLite reference integer kernel.
+ *
+ * The scratchpad arguments are accepted but not used.
+ */
+static inline void ConvPerChannel(const tflite::ConvParams &params, const int32_t *mult,
+                                  const int32_t *shifts, const tflite::RuntimeShape &input_shape,
+                                  const int8 *input_data, const tflite::RuntimeShape &filter_shape,
+                                  const int8 *filter_data, const tflite::RuntimeShape &bias_shape,
+                                  const int32 *bias_data, const tflite::RuntimeShape &output_shape,
+                                  int8 *output_data, const tflite::RuntimeShape &scratchpad_shape,
+                                  int8 *scratchpad_data)
+{
+  (void)scratchpad_data;
+  (void)scratchpad_shape;
+  tflite::reference_integer_ops::ConvPerChannel(params, mult, shifts, input_shape, input_data,
+                                                filter_shape, filter_data, bias_shape, bias_data,
+                                                output_shape, output_data);
+}
+
+/**
+ * @brief Marks the convolution scratchpad tensor as non-allocatable.
+ *
+ * All arguments other than the scratchpad tensor are ignored by this implementation.
+ */
+static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad,
+                                         const luci_interpreter::DataType &input_data_type,
+                                         const tflite::ConvParams &params,
+                                         const tflite::RuntimeShape &input_shape,
+                                         const tflite::RuntimeShape &filter_shape,
+                                         const tflite::RuntimeShape &output_shape)
+{
+  scratchpad->set_allocatable(false);
+  (void)input_data_type;
+  (void)params;
+  (void)input_shape;
+  (void)filter_shape;
+  (void)output_shape;
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_CONV2D_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALDepthToSpace.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALDepthToSpace.h
new file mode 100644
index 000000000..8463e571e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALDepthToSpace.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
+#define LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
+
+#include <tensorflow/lite/kernels/internal/reference/depth_to_space.h>
+
+namespace luci_interpreter_pal
+{
+/**
+ * @brief DepthToSpace entry point: forwards every argument unchanged to the TFLite
+ *        reference kernel.
+ */
+template <typename T>
+static inline void DepthToSpace(const tflite::DepthToSpaceParams &op_params,
+                                const tflite::RuntimeShape &unextended_input_shape,
+                                const T *input_data,
+                                const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::reference_ops::DepthToSpace(op_params, unextended_input_shape, input_data,
+                                      unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALDepthwiseConv2d.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALDepthwiseConv2d.h
new file mode 100644
index 000000000..c9d1a2948
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALDepthwiseConv2d.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H
+#define LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h>
+#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h>
+
+#include <cassert>
+
+namespace luci_interpreter_pal
+{
+/**
+ * @brief Generic per-channel depthwise convolution for element types that have no kernel here.
+ *
+ * Only the int8_t specialization in this header performs a computation. This primary template
+ * fails an assertion; when assertions are compiled out it returns without writing any output.
+ */
+template <typename T>
+static inline void
+DepthwiseConvPerChannel(const tflite::DepthwiseParams &params, const int32_t *output_multiplier,
+                        const int32_t *output_shift, const tflite::RuntimeShape &input_shape,
+                        const T *input_data, const tflite::RuntimeShape &filter_shape,
+                        const T *filter_data, const tflite::RuntimeShape &bias_shape,
+                        const int32_t *bias_data, const tflite::RuntimeShape &output_shape,
+                        T *output_data, const tflite::RuntimeShape &scratchpad_shape,
+                        T *scratchpad_data)
+{
+  {
+    // MARK: At this moment this operation is not supported
+    assert(false && "DepthwiseConvPerChannel NYI");
+    (void)params;
+    (void)output_multiplier;
+    (void)output_shift;
+    (void)input_shape;
+    (void)input_data;
+    (void)filter_shape;
+    (void)filter_data;
+    (void)bias_shape;
+    (void)bias_data;
+    (void)output_shape;
+    (void)output_data;
+    (void)scratchpad_shape;
+    (void)scratchpad_data;
+  }
+}
+
+/**
+ * @brief int8_t per-channel depthwise convolution: forwards to the TFLite reference integer kernel.
+ *
+ * The scratchpad arguments are accepted but not used.
+ */
+template <>
+inline void DepthwiseConvPerChannel<int8_t>(
+  const tflite::DepthwiseParams &params, const int32_t *output_multiplier,
+  const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data,
+  const tflite::RuntimeShape &filter_shape, const int8_t *filter_data,
+  const tflite::RuntimeShape &bias_shape, const int32_t *bias_data,
+  const tflite::RuntimeShape &output_shape, int8_t *output_data,
+  const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
+{
+  (void)scratchpad_shape;
+  (void)scratchpad_data;
+  tflite::reference_integer_ops::DepthwiseConvPerChannel(
+    params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
+    bias_shape, bias_data, output_shape, output_data);
+}
+
+/**
+ * @brief Marks the depthwise convolution scratchpad tensor as non-allocatable.
+ *
+ * All arguments other than the scratchpad tensor are ignored by this implementation.
+ */
+static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad,
+                                         const tflite::DepthwiseParams &params,
+                                         const luci_interpreter::DataType &input_data_type,
+                                         const tflite::RuntimeShape &input_shape,
+                                         const tflite::RuntimeShape &filter_shape,
+                                         const tflite::RuntimeShape &output_shape)
+{
+  (void)params;
+  (void)input_data_type;
+  (void)input_shape;
+  (void)filter_shape;
+  (void)output_shape;
+
+  scratchpad->set_allocatable(false);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALDequantize.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALDequantize.h
new file mode 100644
index 000000000..15ff0327b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALDequantize.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_DEQUANTIZE_H
+#define LUCI_INTERPRETER_PAL_DEQUANTIZE_H
+
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+
+namespace luci_interpreter_pal
+{
+
+/**
+ * @brief Generic Dequantize: forwards to the TFLite reference integer kernel, instantiated
+ *        explicitly for the input element type T.
+ */
+template <typename T>
+static inline void Dequantize(tflite::DequantizationParams &params,
+                              const tflite::RuntimeShape &input_shape, const T *input_data,
+                              const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::reference_integer_ops::Dequantize<T>(params, input_shape, input_data, output_shape,
+                                               output_data);
+}
+
+/**
+ * @brief uint8_t Dequantize: forwards to the kernel in tflite::reference_ops.
+ *
+ * As a non-template overload it is selected for uint8_t input unless the caller names the
+ * template explicitly.
+ */
+static inline void Dequantize(tflite::DequantizationParams &params,
+                              const tflite::RuntimeShape &input_shape, const uint8_t *input_data,
+                              const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::reference_ops::Dequantize(params, input_shape, input_data, output_shape, output_data);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_DEQUANTIZE_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALElu.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALElu.h
new file mode 100644
index 000000000..4089d0a0c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALElu.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_ELU_H
+#define LUCI_INTERPRETER_PAL_ELU_H
+
+#include <tensorflow/lite/kernels/internal/reference/elu.h>
+
+namespace luci_interpreter_pal
+{
+
+/**
+ * @brief Float Elu entry point: forwards every argument unchanged to the TFLite reference kernel.
+ */
+static inline void Elu(const tflite::RuntimeShape &input_shape, const float *input_data,
+                       const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::reference_ops::Elu(input_shape, input_data, output_shape, output_data);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_ELU_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALFullyConnected.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALFullyConnected.h
new file mode 100644
index 000000000..048624d74
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALFullyConnected.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_FULLYCONNECTED_H
+#define LUCI_INTERPRETER_PAL_FULLYCONNECTED_H
+
+#include <tensorflow/lite/kernels/internal/reference/fully_connected.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h>
+
+#include <cassert>
+
+namespace luci_interpreter_pal
+{
+/**
+ * @brief Generic FullyConnected entry point for element types that have no kernel here.
+ *
+ * Only the int8_t specialization in this header performs a computation. This primary template
+ * fails an assertion; when assertions are compiled out it returns without writing any output.
+ */
+template <typename T>
+static inline void FullyConnected(const tflite::FullyConnectedParams &params,
+                                  const tflite::RuntimeShape &input_shape, const T *input_data,
+                                  const tflite::RuntimeShape &filter_shape, const T *filter_data,
+                                  const tflite::RuntimeShape &bias_shape, const int32_t *bias_data,
+                                  const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  {
+    // MARK: At this moment this operation is not supported
+    assert(false && "FullyConnected NYI");
+    (void)params;
+    (void)input_shape;
+    (void)input_data;
+    (void)filter_shape;
+    (void)filter_data;
+    (void)bias_shape;
+    (void)bias_data;
+    (void)output_shape;
+    (void)output_data;
+  }
+}
+
+/**
+ * @brief int8_t FullyConnected: forwards every argument to the TFLite reference integer kernel.
+ */
+template <>
+inline void
+FullyConnected<int8_t>(const tflite::FullyConnectedParams &params,
+                       const tflite::RuntimeShape &input_shape, const int8_t *input_data,
+                       const tflite::RuntimeShape &filter_shape, const int8_t *filter_data,
+                       const tflite::RuntimeShape &bias_shape, const int32_t *bias_data,
+                       const tflite::RuntimeShape &output_shape, int8_t *output_data)
+{
+  tflite::reference_integer_ops::FullyConnected(params, input_shape, input_data, filter_shape,
+                                                filter_data, bias_shape, bias_data, output_shape,
+                                                output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_FULLYCONNECTED_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALL2Normalize.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALL2Normalize.h
new file mode 100644
index 000000000..f84742a44
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALL2Normalize.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_L2NORMALIZE_H
+#define LUCI_INTERPRETER_PAL_L2NORMALIZE_H
+
+#include <tensorflow/lite/kernels/internal/reference/l2normalization.h>
+
+namespace luci_interpreter_pal
+{
+/**
+ * @brief L2Normalization entry point: forwards every argument to the TFLite reference kernel.
+ */
+template <typename T>
+static inline void L2Normalization(const tflite::L2NormalizationParams &op_params,
+                                   const tflite::RuntimeShape &input_shape, const T *input_data,
+                                   const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  tflite::reference_ops::L2Normalization(op_params, input_shape, input_data, output_shape,
+                                         output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_L2NORMALIZE_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALL2Pool2D.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALL2Pool2D.h
new file mode 100644
index 000000000..38a302fc6
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALL2Pool2D.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_L2POOL2D_H
+#define LUCI_INTERPRETER_PAL_L2POOL2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/pooling.h>
+
+namespace luci_interpreter_pal
+{
+/**
+ * @brief L2Pool entry point: forwards every argument unchanged to the TFLite reference kernel.
+ */
+template <typename T>
+static inline void L2Pool(const tflite::PoolParams &params, const tflite::RuntimeShape &input_shape,
+                          const T *input_data, const tflite::RuntimeShape &output_shape,
+                          T *output_data)
+{
+  tflite::reference_ops::L2Pool(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_L2POOL2D_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALLeakyRelu.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALLeakyRelu.h
new file mode 100644
index 000000000..9ccd2224f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALLeakyRelu.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_LEAKYRELU_H
+#define LUCI_INTERPRETER_PAL_LEAKYRELU_H
+
+#include <tensorflow/lite/kernels/internal/reference/leaky_relu.h>
+
+namespace luci_interpreter_pal
+{
+/**
+ * @brief Float LeakyRelu entry point: forwards every argument to the TFLite reference kernel.
+ */
+static inline void LeakyRelu(const tflite::LeakyReluParams &params,
+                             const tflite::RuntimeShape &input_shape, const float *input_data,
+                             const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::reference_ops::LeakyRelu(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_LEAKYRELU_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALMul.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALMul.h
new file mode 100644
index 000000000..347a97a83
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALMul.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_MUL_H
+#define LUCI_INTERPRETER_PAL_MUL_H
+
+#include <tensorflow/lite/kernels/internal/reference/mul.h>
+
+namespace luci_interpreter_pal
+{
+/**
+ * @brief Mul entry point.
+ *
+ * This implementation has no separate non-broadcast path: it forwards to the same TFLite
+ * reference BroadcastMul4DSlow kernel that the BroadcastMul4DSlow entry point below uses.
+ */
+template <typename T>
+static inline void Mul(tflite::ArithmeticParams &params, const tflite::RuntimeShape &input1_shape,
+                       const T *input1_data, const tflite::RuntimeShape &input2_shape,
+                       const T *input2_data, const tflite::RuntimeShape &output_shape,
+                       T *output_data)
+{
+  tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape,
+                                            input2_data, output_shape, output_data);
+}
+
+/**
+ * @brief BroadcastMul4DSlow entry point: forwards every argument to the TFLite reference kernel.
+ */
+template <typename T>
+static inline void
+BroadcastMul4DSlow(tflite::ArithmeticParams &params, const tflite::RuntimeShape &input1_shape,
+                   const T *input1_data, const tflite::RuntimeShape &input2_shape,
+                   const T *input2_data, const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape,
+                                            input2_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_MUL_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALNeg.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALNeg.h
new file mode 100644
index 000000000..be5903a0c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALNeg.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_NEG_H
+#define LUCI_INTERPRETER_PAL_NEG_H
+
+#include <tensorflow/lite/kernels/internal/reference/neg.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Negate(const tflite::RuntimeShape &input_shape, const T *input_data,
+                          const tflite::RuntimeShape &output_shape, T *output_data)
+{ // mcu PAL: forwards all arguments unchanged to tflite::reference_ops::Negate.
+  tflite::reference_ops::Negate(input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_NEG_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALQuantize.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALQuantize.h
new file mode 100644
index 000000000..6046789ae
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALQuantize.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_QUANTIZE_H
+#define LUCI_INTERPRETER_PAL_QUANTIZE_H
+
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Quantize(tflite::QuantizationParams &params,
+                            const tflite::RuntimeShape &input_shape, const float *input_data,
+                            const tflite::RuntimeShape &output_shape, T *output_data)
+{ // mcu PAL: float -> T conversion is delegated to tflite::reference_ops::AffineQuantize.
+  tflite::reference_ops::AffineQuantize(params, input_shape, input_data, output_shape, output_data);
+}
+
+template <typename Input, typename Output>
+static inline void Requantize(const Input *input_data, int32_t size,
+                              int32_t effective_scale_multiplier, int32_t effective_scale_shift,
+                              int32_t input_zero_point, int32_t output_zero_point,
+                              Output *output_data)
+{ // mcu PAL: forwards all arguments unchanged to tflite::reference_ops::Requantize.
+  tflite::reference_ops::Requantize(input_data, size, effective_scale_multiplier,
+                                    effective_scale_shift, input_zero_point, output_zero_point,
+                                    output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_QUANTIZE_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALResizeBilinear.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALResizeBilinear.h
new file mode 100644
index 000000000..cc9f0fd54
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALResizeBilinear.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
+#define LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
+
+#include <tensorflow/lite/kernels/internal/reference/resize_bilinear.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+ResizeBilinear(const tflite::ResizeBilinearParams &op_params,
+               const tflite::RuntimeShape &unextended_input_shape, const T *input_data,
+               const tflite::RuntimeShape &output_size_shape, const int32 *output_size_data,
+               const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{ // mcu PAL: forwards all arguments unchanged to tflite::reference_ops::ResizeBilinear.
+  tflite::reference_ops::ResizeBilinear(op_params, unextended_input_shape, input_data,
+                                        output_size_shape, output_size_data,
+                                        unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALResizeNearestNeighbor.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALResizeNearestNeighbor.h
new file mode 100644
index 000000000..f4d5a6ed3
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALResizeNearestNeighbor.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
+#define LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
+
+#include <tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+ResizeNearestNeighbor(const tflite::ResizeNearestNeighborParams &op_params,
+                      const tflite::RuntimeShape &unextended_input_shape, const T *input_data,
+                      const tflite::RuntimeShape &output_size_shape, const int32 *output_size_data,
+                      const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{ // mcu PAL: forwards all arguments to tflite::reference_ops::ResizeNearestNeighbor.
+  tflite::reference_ops::ResizeNearestNeighbor(op_params, unextended_input_shape, input_data,
+                                               output_size_shape, output_size_data,
+                                               unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALSVDF.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALSVDF.h
new file mode 100644
index 000000000..3bba668fb
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALSVDF.h
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SVDF_H
+#define LUCI_INTERPRETER_PAL_SVDF_H
+
+#include <tensorflow/lite/kernels/internal/reference/svdf.h>
+
+namespace luci_interpreter_pal
+{
+static inline void
+IntegerSVDF(const TfLiteSVDFParams &params, const tflite::RuntimeShape &input_shape,
+            const int8_t *input_data, const tflite::RuntimeShape &weight_feature_shape,
+            const int8_t *weight_feature_data, const tflite::RuntimeShape &weight_time_shape,
+            const int16_t *weight_time_data, const tflite::RuntimeShape &bias_shape,
+            const int32_t *bias_data, int16_t *activation_state_data,
+            const tflite::RuntimeShape &output_shape, int8_t *output_data, int32_t *scratchpad_data,
+            int32_t *output_temp_data, int32_t scale_1_a, int scale_1_b, int32_t scale_2_a,
+            int scale_2_b, int32_t input_zp, int32_t output_zp)
+{ // Full-integer SVDF step: int8 input/output, int16 state, int32 scratch buffers.
+  const int n_rank = params.rank;
+  const int n_batch = input_shape.Dims(0);
+  const int n_input = input_shape.Dims(1);
+  const int n_filter = weight_feature_shape.Dims(0);
+  const int n_unit = n_filter / n_rank; // n_rank consecutive filters are summed into one unit
+  const int n_memory = weight_time_shape.Dims(1);
+
+  // Shift the whole activation_state buffer left by one element (element 0 is dropped).
+  {
+    int16_t *new_state_start = activation_state_data;
+    const int16_t *old_state_start = activation_state_data + 1;
+    const int16_t *old_state_end = activation_state_data + n_batch * n_filter * n_memory;
+    while (old_state_start != old_state_end)
+    {
+      *new_state_start++ = *old_state_start++;
+    }
+  }
+
+  // No need to clear the newest slot: the feature matmul below assigns it (no accumulation).
+
+  // Feature matmul: one dot product per (batch, filter), stored in that filter's last slot.
+  {
+    const int32_t output_max = std::numeric_limits<int16_t>::max();
+    const int32_t output_min = std::numeric_limits<int16_t>::min();
+    int16_t *result_in_batch = activation_state_data + (n_memory - 1);
+    for (int b = 0; b < n_batch; b++)
+    {
+      const int8_t *matrix_ptr = weight_feature_data;
+      for (int r = 0; r < n_filter; r++)
+      {
+        int32_t dot_prod = 0;
+        const int8_t *vector_in_batch = input_data + b * n_input;
+        for (int c = 0; c < n_input; c++)
+        {
+          dot_prod += *matrix_ptr++ * (*vector_in_batch++ - input_zp); // input zero point removed
+        }
+        dot_prod = tflite::MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b);
+        dot_prod = std::min(std::max(output_min, dot_prod), output_max); // clamp to int16
+        // This assumes the state is symmetrically quantized (zero point 0). Otherwise
+        // the newest state slot would have to be initialized to its zero point and
+        // the dot product accumulated on top of it.
+        // Equivalent to the following:
+        //     *result_in_batch = zero point, which happens to be zero.
+        //     *result_in_batch += dot_prod.
+        *result_in_batch = dot_prod;
+        result_in_batch += n_memory;
+      }
+    }
+  }
+
+  // Time: dot each filter's n_memory state entries with its time weights into scratchpad.
+  {
+    for (int b = 0; b < n_batch; ++b)
+    {
+      int32_t *scratch_ptr_batch = scratchpad_data + b * n_filter;
+
+      // Batched vector dot product; the time weights restart from the beginning for each batch.
+      const int16_t *vector1_ptr = weight_time_data;
+      const int16_t *vector2_ptr = activation_state_data + b * n_memory * n_filter;
+
+      for (int i = 0; i < n_filter; i++)
+      {
+        *scratch_ptr_batch = 0;
+        for (int j = 0; j < n_memory; j++)
+        {
+          *scratch_ptr_batch += *vector1_ptr++ * *vector2_ptr++;
+        }
+        scratch_ptr_batch++;
+      }
+    }
+  }
+
+  // Reduce, add bias, rescale, activation.
+  {
+    // Seed output_temp_data with the bias, or with zeros when bias_data is null.
+    if (bias_data)
+    {
+      // Copy the n_unit bias values into every batch row.
+      for (int i = 0; i < n_batch; ++i)
+      {
+        int32_t *output_ptr = output_temp_data + i * n_unit;
+        const int32_t *bias_ptr = bias_data;
+        for (int j = 0; j < n_unit; ++j)
+        {
+          *output_ptr++ = *bias_ptr++;
+        }
+      }
+    }
+    else
+    {
+      int32_t *output_ptr = output_temp_data;
+      for (int i = 0; i < n_batch * n_unit; ++i)
+      {
+        *output_ptr++ = 0;
+      }
+    }
+
+    // Reduce: add each run of n_rank consecutive scratchpad values onto one output unit.
+    for (int b = 0; b < n_batch; ++b)
+    {
+      int32_t *output_temp_ptr = output_temp_data + b * n_unit;
+      int32_t *scratch_ptr_batch = scratchpad_data + b * n_filter;
+
+      // scratch_ptr_batch walks the batch row once; unit i consumes the next n_rank entries.
+      for (int i = 0; i < n_unit; ++i)
+      {
+        for (int j = 0; j < n_rank; ++j)
+        {
+          output_temp_ptr[i] += *scratch_ptr_batch++;
+        }
+      }
+    }
+
+    // Rescale with (scale_2_a, scale_2_b), add the output zero point, clamp to the int8 range.
+    const int32_t output_max = std::numeric_limits<int8_t>::max();
+    const int32_t output_min = std::numeric_limits<int8_t>::min();
+    for (int i = 0; i < n_batch * n_unit; ++i)
+    {
+      int32_t x1 = output_temp_data[i];
+      int32_t x2 = tflite::MultiplyByQuantizedMultiplier(x1, scale_2_a, scale_2_b);
+      int32_t x3 = x2 + output_zp;
+      int32_t x4 = std::min(std::max(output_min, x3), output_max);
+      output_data[i] = static_cast<int8_t>(x4);
+    }
+  }
+}
+// Float SVDF step for the mcu PAL.
+//
+// 1. Shifts activation_state_data left by one element.
+// 2. Stores, for every (batch, filter) pair, the dot product of the input row and the
+//    filter's feature weights in the last slot of that filter's state row.
+// 3. Delegates the time-weight / bias / activation stage to
+//    tflite::reference_ops::ApplyTimeWeightsBiasAndActivation.
+//
+// bias_shape and output_shape are accepted but never read here.
+// activation_state_data is updated in place; scratchpad_data is handed to step 3 as-is.
+static inline void
+FloatSVDF(const TfLiteSVDFParams &params, const tflite::RuntimeShape &input_shape,
+          const float *input_data, const tflite::RuntimeShape &weight_feature_shape,
+          const float *weight_feature_data, const tflite::RuntimeShape &weight_time_shape,
+          const float *weight_time_data, const tflite::RuntimeShape &bias_shape,
+          const float *bias_data, float *scratchpad_data, float *activation_state_data,
+          const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  // Dimensions derived from the tensor shapes and the op rank.
+  const int32_t rank = params.rank;
+  const int32_t batch_size = input_shape.Dims(0);
+  const int32_t input_size = input_shape.Dims(1);
+  const int32_t num_filters = weight_feature_shape.Dims(0);
+  const int32_t memory_size = weight_time_shape.Dims(1);
+  const int32_t num_units = num_filters / rank;
+
+  // Slide the state buffer left by one element; the last slot of each filter row
+  // is overwritten by the feature matmul below, so it needs no clearing.
+  {
+    const float *const state_end = activation_state_data + batch_size * num_filters * memory_size;
+    float *dst = activation_state_data;
+    for (const float *src = activation_state_data + 1; src != state_end; ++src, ++dst)
+    {
+      *dst = *src;
+    }
+  }
+
+  // Feature matmul, i.e. conv1d(inputs, weights_feature): the result for
+  // (batch, filter) lands in slot memory_size - 1 of that filter's state row,
+  // so consecutive results are memory_size elements apart.
+  for (int batch = 0; batch < batch_size; ++batch)
+  {
+    const float *const input_row = input_data + batch * input_size;
+    for (int filter = 0; filter < num_filters; ++filter)
+    {
+      const float *const weights_row = weight_feature_data + filter * input_size;
+      float acc = 0.0f;
+      for (int k = 0; k < input_size; ++k)
+      {
+        acc += weights_row[k] * input_row[k];
+      }
+      activation_state_data[(batch * num_filters + filter) * memory_size + memory_size - 1] = acc;
+    }
+  }
+
+  // Time weights, bias and activation are handled by the TFLite reference helper.
+  tflite::reference_ops::ApplyTimeWeightsBiasAndActivation(
+    batch_size, memory_size, num_filters, num_units, rank, weight_time_data, bias_data,
+    params.activation, activation_state_data, scratchpad_data, output_data);
+}
+
+static inline void SetupScratchpadTensor(
+  const luci_interpreter::DataType &input_data_type,
+  const luci_interpreter::DataType &weight_feature_data_type,
+  luci_interpreter::Tensor *scratchpad_1, luci_interpreter::Tensor *scratchpad_2,
+  luci_interpreter::Tensor *scratchpad_3, luci_interpreter::Tensor *scratchpad_4,
+  luci_interpreter::Tensor *scratchpad_5, luci_interpreter::Tensor *scratchpad_6,
+  const luci_interpreter::Shape input_shape, const luci_interpreter::Shape weight_time_shape,
+  const int32_t batch_size, const int32_t num_filters, const int32_t num_units)
+{
+  // Not used by the mcu implementation; cast to void to mark them as intentionally unused.
+  (void)input_shape;
+  (void)weight_time_shape;
+  (void)scratchpad_3;
+  (void)scratchpad_4;
+  (void)scratchpad_5;
+  (void)scratchpad_6;
+
+  // Hybrid = float input combined with 8-bit (S8 or U8) feature weights: rejected on mcu.
+  const bool weights_are_8bit = weight_feature_data_type == loco::DataType::S8 ||
+                                weight_feature_data_type == loco::DataType::U8;
+  if (weights_are_8bit && input_data_type == loco::DataType::FLOAT32)
+  {
+    throw std::runtime_error("Hybrid type is not currently supported for mcu platform");
+  }
+
+  // scratchpad_1 is always resized to {batch_size, num_filters}.
+  scratchpad_1->resize({batch_size, num_filters});
+
+  // scratchpad_2 is only resized for the full-integer (S8 input) case.
+  if (input_data_type != loco::DataType::S8)
+    return;
+  scratchpad_2->resize({batch_size, num_units});
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SVDF_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALSoftmax.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALSoftmax.h
new file mode 100644
index 000000000..9838b542d
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALSoftmax.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SOFTMAX_H
+#define LUCI_INTERPRETER_PAL_SOFTMAX_H
+
+#include <tensorflow/lite/kernels/internal/reference/softmax.h>
+
+namespace luci_interpreter_pal
+{
+static inline void PopulateSoftmaxLookupTable(tflite::SoftmaxParams *data, float input_scale,
+                                              float beta)
+{
+  // Intentionally a no-op on mcu; the casts below only mark the parameters as used.
+  (void)data;
+  (void)input_scale;
+  (void)beta;
+}
+
+static inline void InitializeParams(tflite::SoftmaxParams *params, float input_scale, float beta)
+{
+  // Integer-bit budget handed to both TFLite helpers below.
+  constexpr int kScaledDiffIntegerBits = 5;
+  int32 beta_multiplier = 0;
+  int beta_left_shift = 0;
+  tflite::PreprocessSoftmaxScaling(beta, input_scale, kScaledDiffIntegerBits, &beta_multiplier,
+                                   &beta_left_shift);
+
+  params->input_multiplier = beta_multiplier;
+  params->input_left_shift = beta_left_shift;
+  params->diff_min = -tflite::CalculateInputRadius(kScaledDiffIntegerBits, beta_left_shift);
+}
+
+template <typename T>
+static inline void Softmax(const tflite::SoftmaxParams &params,
+                           const tflite::RuntimeShape &input_shape, const T *input_data,
+                           const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  // NYI on mcu. NOTE: under NDEBUG the assert vanishes and output_data is left unwritten.
+  (void)params;
+  (void)input_shape;
+  (void)input_data;
+  (void)output_shape;
+  (void)output_data;
+  assert(false && "Softmax NYI");
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SOFTMAX_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALSpaceToBatchND.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALSpaceToBatchND.h
new file mode 100644
index 000000000..fdddaa929
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALSpaceToBatchND.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
+#define LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
+
+#include <tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+SpaceToBatchND(const tflite::SpaceToBatchParams &params,
+               const tflite::RuntimeShape &unextended_input1_shape, const T *input1_data,
+               const tflite::RuntimeShape &unextended_input2_shape, const int32 *block_shape_data,
+               const tflite::RuntimeShape &unextended_input3_shape, const int32 *paddings_data,
+               const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{ // mcu PAL: forwards all arguments unchanged to tflite::reference_ops::SpaceToBatchND.
+  tflite::reference_ops::SpaceToBatchND(
+    params, unextended_input1_shape, input1_data, unextended_input2_shape, block_shape_data,
+    unextended_input3_shape, paddings_data, unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALSpaceToDepth.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALSpaceToDepth.h
new file mode 100644
index 000000000..816b7f663
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALSpaceToDepth.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SPACETODEPTH_H
+#define LUCI_INTERPRETER_PAL_SPACETODEPTH_H
+
+#include <tensorflow/lite/kernels/internal/reference/space_to_depth.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void SpaceToDepth(const tflite::SpaceToDepthParams &op_params,
+                                const tflite::RuntimeShape &unextended_input_shape,
+                                const T *input_data,
+                                const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{ // mcu PAL: forwards all arguments unchanged to tflite::reference_ops::SpaceToDepth.
+  tflite::reference_ops::SpaceToDepth(op_params, unextended_input_shape, input_data,
+                                      unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SPACETODEPTH_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/PALSub.h b/compiler/luci-micro/luci-interpreter/pal/mcu/PALSub.h
new file mode 100644
index 000000000..ea57578c6
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/PALSub.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SUB_H
+#define LUCI_INTERPRETER_PAL_SUB_H
+
+#include <tensorflow/lite/kernels/internal/reference/sub.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Sub(const tflite::ArithmeticParams &params,
+                       const tflite::RuntimeShape &input1_shape, const T *input1_data,
+                       const tflite::RuntimeShape &input2_shape, const T *input2_data,
+                       const tflite::RuntimeShape &output_shape, T *output_data)
+{ // mcu PAL: forwards all arguments unchanged to tflite::reference_ops::Sub.
+  tflite::reference_ops::Sub(params, input1_shape, input1_data, input2_shape, input2_data,
+                             output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SUB_H
diff --git a/compiler/luci-micro/luci-interpreter/pal/mcu/pal.cmake b/compiler/luci-micro/luci-interpreter/pal/mcu/pal.cmake
new file mode 100644
index 000000000..907d51de6
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/pal/mcu/pal.cmake
@@ -0,0 +1,56 @@
+macro(initialize_pal) # Locates TensorFlow 2.6.0 sources; sets PAL_INITIALIZED only if all exist.
+    nnas_find_package(TensorFlowSource EXACT 2.6.0 QUIET)
+    nnas_find_package(TensorFlowGEMMLowpSource EXACT 2.6.0 QUIET)
+    nnas_find_package(TensorFlowEigenSource EXACT 2.6.0 QUIET)
+    nnas_find_package(TensorFlowRuySource EXACT 2.6.0 QUIET)
+
+    if (NOT TensorFlowSource_FOUND)
+        message(STATUS "Skipping luci-interpreter: TensorFlow not found")
+        return() # inside a macro, return() leaves the caller's scope, not just the macro
+    endif ()
+
+    if (NOT TensorFlowGEMMLowpSource_FOUND)
+        message(STATUS "Skipping luci-interpreter: gemmlowp not found")
+        return()
+    endif ()
+
+    if (NOT TensorFlowEigenSource_FOUND)
+        message(STATUS "Skipping luci-interpreter: Eigen not found")
+        return()
+    endif ()
+
+    if (NOT TensorFlowRuySource_FOUND)
+        message(STATUS "Skipping luci-interpreter: Ruy not found")
+        return()
+    endif ()
+    #find_package(Threads REQUIRED)
+
+    set(PAL_INITIALIZED TRUE) # reached only when none of the early returns above fired
+endmacro()
+
+macro(add_pal_to_target TGT)
+    target_include_directories(${TGT} PRIVATE "${PAL}")
+    target_include_directories(${TGT} PRIVATE
+            "${TensorFlowRuySource_DIR}"
+            "${TensorFlowGEMMLowpSource_DIR}"
+            "${TensorFlowEigenSource_DIR}"
+            "${TensorFlowSource_DIR}")
+    target_include_directories(${TGT} PRIVATE ${LUCI_INTERPRETER_PAL_DIR})
+
+    # Define the helper library only once: the macro may be applied to several targets and
+    # a second add_library() call with the same target name is a configure-time error.
+    if (NOT TARGET luci_interpreter_mcu_pal)
+        # TODO put it back, I changed my mind.
+        # instead add sources with visitors in this library
+        set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc
+                ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/tensor_utils.cc
+                ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc)
+        add_library(luci_interpreter_mcu_pal STATIC ${PAL_SOURCES})
+        set_target_properties(luci_interpreter_mcu_pal PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        target_include_directories(luci_interpreter_mcu_pal PRIVATE
+                "${TensorFlowRuySource_DIR}" "${TensorFlowGEMMLowpSource_DIR}"
+                "${TensorFlowEigenSource_DIR}" "${TensorFlowSource_DIR}")
+    endif ()
+    target_link_libraries(${TGT} PRIVATE luci_interpreter_mcu_pal)
+    #target_link_libraries(${TGT} PRIVATE Threads::Threads luci_interpreter_mcu_pal)
+endmacro()
diff --git a/compiler/luci-micro/luci-interpreter/requires.cmake b/compiler/luci-micro/luci-interpreter/requires.cmake
new file mode 100644
index 000000000..f411f387a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/requires.cmake
@@ -0,0 +1 @@
+require(luci)
diff --git a/compiler/luci-micro/luci-interpreter/src/BuddyMemoryManager.cpp b/compiler/luci-micro/luci-interpreter/src/BuddyMemoryManager.cpp
new file mode 100644
index 000000000..6ad1f320c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/BuddyMemoryManager.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "luci_interpreter/BuddyMemoryManager.h"
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+BuddyMemoryManager::BuddyMemoryManager(uint8_t *memory_start, int32_t memSize)
+{
+  // Manage 2^order bytes, where order = lowerLog2(memSize) (presumably floor(log2)).
+  const int32_t order = lowerLog2(memSize);
+  // We assume that the requested size of memory does not exceed 4 GB
+  assert(order < 32);
+  memSize = 1 << order;
+
+  for (auto &head : _free_blocks)
+    head = nullptr;
+  _num_blocks = 0;
+
+  // The pool starts as a single free block whose header sits at the front of the buffer.
+  _start_block = reinterpret_cast<Block *>(memory_start);
+  _start_block->size = memSize - sizeof(Block);
+  _start_block->is_free = true;
+  _start_block->self = _start_block;
+  _size = _start_block->size;
+  addToBlocks(_start_block, order);
+}
+
+void BuddyMemoryManager::allocate_memory(luci_interpreter::Tensor &tensor)
+{
+  const size_t element_size = getDataTypeSize(tensor.element_type());
+  const int32_t num_elements = tensor.shape().num_elements();
+  const auto size = num_elements * element_size;
+  const auto footprint = size + sizeof(Block); // payload plus the block header
+  // Smallest order whose block can hold the footprint (exact when it is a power of two).
+  auto l = (footprint & (footprint - 1)) == 0 ? lowerLog2(footprint) : lowerLog2(footprint) + 1;
+
+  // Walk up to the first order that still has a free block.
+  while (l < 32 && !_free_blocks[l])
+    l++;
+
+  // Out of memory: fail loudly in every build type. An assert alone would vanish under
+  // NDEBUG and the code would go on to read _free_blocks[32] and use the result.
+  if (l >= 32)
+    throw std::runtime_error("BuddyMemoryManager: not enough free memory for tensor");
+
+  Block *tmp = _free_blocks[l];
+  removeFromBlocks(tmp, l);
+  // Halve the block while one half is still large enough for the footprint.
+  while ((tmp->size + sizeof(Block)) / 2 >= footprint)
+  {
+    divideBlock(tmp, l);
+    l--;
+  }
+
+  tmp->is_free = false;
+  tmp->self = tmp;
+  _num_blocks++;
+  tensor.set_data_buffer(reinterpret_cast<uint8_t *>(tmp + 1)); // data follows the header
+}
+
+void BuddyMemoryManager::release_memory(luci_interpreter::Tensor &tensor)
+{
+  // The block header lives immediately in front of the tensor's data buffer.
+  auto *payload = (uint8_t *)tensor.data<void>();
+  auto *block = (Block *)(payload - sizeof(Block));
+  assert(block->self == block); // the header's self pointer must point back at itself
+
+  block->is_free = true;
+  addToBlocks(block, lowerLog2(block->size + sizeof(Block)));
+
+  // Keep merging until mergeBlock returns null or the block spans the whole pool.
+  while (block && block->size != _size)
+  {
+    block = mergeBlock(block);
+  }
+
+  _num_blocks--;
+  tensor.set_data_buffer(nullptr);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/BuddyMemoryManager.test.cpp b/compiler/luci-micro/luci-interpreter/src/BuddyMemoryManager.test.cpp
new file mode 100644
index 000000000..29fb767b7
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/BuddyMemoryManager.test.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci_interpreter/BuddyMemoryManager.h"
+#include <gtest/gtest.h>
+
+namespace luci_interpreter
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(BuddyMemoryManager, basic)
+{
+  // A 200-byte pool, of which the manager is told to use 130 bytes.
+  auto pool = std::make_unique<uint8_t[]>(200);
+  auto manager = std::make_unique<BuddyMemoryManager>(pool.get(), 130);
+
+  // First tensor: 8 bytes, written and read back through the Tensor API.
+  Tensor first_tensor(DataType::U8, Shape({8}), AffineQuantization{}, "first_tensor");
+  manager->allocate_memory(first_tensor);
+
+  uint8_t data_1[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  first_tensor.writeData(data_1, 8);
+  uint8_t array_1[8];
+  first_tensor.readData(array_1, 8);
+  for (int i = 0; i < 8; ++i)
+  {
+    EXPECT_EQ(data_1[i], array_1[i]);
+  }
+
+  // Second tensor: 2x5 bytes, allocated while the first tensor still holds its buffer.
+  Tensor second_tensor(DataType::U8, Shape({2, 5}), AffineQuantization{}, "second_tensor");
+  manager->allocate_memory(second_tensor);
+
+  uint8_t data_2[2][5] = {{11, 22, 33, 44, 55}, {12, 23, 34, 45, 56}};
+  second_tensor.writeData(data_2, 10);
+  uint8_t array_2[2][5];
+  second_tensor.readData(array_2, 10);
+  for (int i = 0; i < 2; ++i)
+  {
+    for (int j = 0; j < 5; ++j)
+      EXPECT_EQ(data_2[i][j], array_2[i][j]);
+  }
+
+  // Releasing must leave each tensor without a data pointer.
+  manager->release_memory(first_tensor);
+  EXPECT_EQ(first_tensor.data<void>(), nullptr);
+
+  manager->release_memory(second_tensor);
+  EXPECT_EQ(second_tensor.data<void>(), nullptr);
+}
+
+} // namespace
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/CMakeLists.txt b/compiler/luci-micro/luci-interpreter/src/CMakeLists.txt
new file mode 100644
index 000000000..997b75a84
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/CMakeLists.txt
@@ -0,0 +1,61 @@
+include("${LUCI_INTERPRETER_PAL_DIR}/pal.cmake")
+# initialize_pal() is expected to come from the pal.cmake included above.
+initialize_pal()
+# Skip the rest of this directory when PAL_INITIALIZED is false or unset afterwards.
+if (NOT PAL_INITIALIZED)
+  message("PAL Failed to initialize, skip luci-interpreter")
+  return()
+endif()
+
+message(STATUS "LUCI INTERPRETER BEGIN")
+# Target names used below and in the subdirectories; each carries LUCI_INTERPRETER_SUFFIX.
+set(LUCI_INTERPRETER_BINARY "luci_interpreter${LUCI_INTERPRETER_SUFFIX}")
+set(LUCI_INTERPRETER_CORE "luci_interpreter_core${LUCI_INTERPRETER_SUFFIX}")
+set(LUCI_INTERPRETER_KERNELS "luci_interpreter_kernels${LUCI_INTERPRETER_SUFFIX}")
+set(LUCI_INTERPRETER_LOADER "luci_interpreter_loader${LUCI_INTERPRETER_SUFFIX}")
+set(LUCI_INTERPRETER_IMPORT "luci_interpreter_import${LUCI_INTERPRETER_SUFFIX}")
+# A STATUS line is printed after each component directory has been processed.
+add_subdirectory(core)
+message(STATUS "LUCI INTERPRETER CORE")
+add_subdirectory(kernels)
+message(STATUS "LUCI INTERPRETER KERNELS")
+add_subdirectory(loader)
+message(STATUS "LUCI INTERPRETER LOADER")
+add_subdirectory(import)
+message(STATUS "LUCI INTERPRETER IMPORT")
+
+message(STATUS "LUCI INTERPRETER INITIALIZED")
+
+set(SOURCES
+    "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/Interpreter.h" Interpreter.cpp
+    "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/SimpleMemoryManager.h" SimpleMemoryManager.cpp
+    "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/TestMemoryManager.h" TestMemoryManager.cpp
+    "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/BuddyMemoryManager.h" BuddyMemoryManager.cpp
+    "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/StaticMemoryManager.h" StaticMemoryManager.cpp)
+# Shared library unless LUCI_INTERPRETER_STATIC is set.
+if (LUCI_INTERPRETER_STATIC)
+  add_library(${LUCI_INTERPRETER_BINARY} STATIC ${SOURCES})
+else ()
+  add_library(${LUCI_INTERPRETER_BINARY} SHARED ${SOURCES})
+endif ()
+
+# The include directory is exported to dependents; the source tree is for this target only.
+target_include_directories(${LUCI_INTERPRETER_BINARY}
+    PUBLIC "${LUCI_INTERPRETER_INCLUDE_DIR}"
+    PRIVATE "${LUCI_INTERPRETER_SOURCE_DIR}")
+target_link_libraries(${LUCI_INTERPRETER_BINARY}
+    PUBLIC luci_lang ${LUCI_INTERPRETER_LOADER} ${LUCI_INTERPRETER_CORE}
+    PRIVATE nncc_common)
+
+install(TARGETS ${LUCI_INTERPRETER_BINARY} DESTINATION lib)
+install(DIRECTORY include/ DESTINATION include
+        FILES_MATCHING PATTERN "*.h")
+
+# Everything below is only configured when ENABLE_TEST is on.
+if (NOT ENABLE_TEST)
+  return()
+endif ()
+nnas_find_package(GTest REQUIRED)
+set(TEST_SOURCES BuddyMemoryManager.test.cpp)
+GTest_AddTest(buddy_manager_test ${TEST_SOURCES})
+target_link_libraries(buddy_manager_test ${LUCI_INTERPRETER_BINARY})
diff --git a/compiler/luci-micro/luci-interpreter/src/Interpreter.cpp b/compiler/luci-micro/luci-interpreter/src/Interpreter.cpp
new file mode 100644
index 000000000..8cf272efd
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/Interpreter.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci_interpreter/Interpreter.h"
+#include "luci_interpreter/SimpleMemoryManager.h"
+
+#include "loader/ModuleLoader.h"
+
+#include <algorithm>
+#include <stdexcept>
+namespace luci_interpreter
+{
+
+namespace
+{
+
+// Fans runtime events out to every attached observer, translating runtime objects
+// (tensors, kernels) back to the IR nodes recorded in RuntimeToIR.
+class EventNotifierImpl final : public EventNotifier
+{
+public:
+  // Both arguments are stored by reference and must outlive this notifier.
+  EventNotifierImpl(const RuntimeToIR &runtime_to_ir,
+                    const std::vector<ExecutionObserver *> &observers)
+    : _runtime_to_ir(runtime_to_ir), _observers(observers)
+  {
+  }
+
+  // Reports a write to `tensor`, together with its IR node, to each observer.
+  void postTensorWrite(const Tensor *tensor) override
+  {
+    assert(tensor != nullptr);
+    for (ExecutionObserver *listener : _observers)
+      listener->postTensorWrite(_runtime_to_ir.tensor_to_node.at(tensor), tensor);
+  }
+
+  // Tells each observer that the operator behind `kernel` is about to run.
+  void preOperatorExecute(const Kernel *kernel) override
+  {
+    assert(kernel != nullptr);
+    for (ExecutionObserver *listener : _observers)
+      listener->preOperatorExecute(_runtime_to_ir.kernel_to_node.at(kernel));
+  }
+
+  // Tells each observer that the operator behind `kernel` has finished running.
+  void postOperatorExecute(const Kernel *kernel) override
+  {
+    assert(kernel != nullptr);
+    for (ExecutionObserver *listener : _observers)
+      listener->postOperatorExecute(_runtime_to_ir.kernel_to_node.at(kernel));
+  }
+
+private:
+  const RuntimeToIR &_runtime_to_ir;
+  const std::vector<ExecutionObserver *> &_observers;
+};
+
+} // namespace
+
+Interpreter::Interpreter(const luci::Module *module)
+{
+  // No manager supplied: create a SimpleMemoryManager owned by this interpreter.
+  _default_memory_manager = std::make_unique<SimpleMemoryManager>();
+  _runtime_to_ir = std::make_unique<RuntimeToIR>();
+  _event_notifier = std::make_unique<EventNotifierImpl>(*_runtime_to_ir, _observers);
+  _runtime_module = std::make_unique<RuntimeModule>(_event_notifier.get());
+
+  // Let ModuleLoader load `module` into the freshly created runtime module.
+  ModuleLoader(module, _runtime_module.get(), *_runtime_to_ir, _node_to_tensor,
+               _default_memory_manager.get()).load();
+}
+
+Interpreter::Interpreter(const luci::Module *module,
+                         luci_interpreter::IMemoryManager *memory_manager)
+{
+  // This constructor does not take ownership of `memory_manager`; it is only passed on.
+  assert(memory_manager && "Use Interpreter::Interpreter(module) constructor instead");
+
+  _runtime_to_ir = std::make_unique<RuntimeToIR>();
+  _event_notifier = std::make_unique<EventNotifierImpl>(*_runtime_to_ir, _observers);
+  _runtime_module = std::make_unique<RuntimeModule>(_event_notifier.get());
+
+  ModuleLoader(module, _runtime_module.get(), *_runtime_to_ir, _node_to_tensor, memory_manager)
+    .load();
+}
+
+Interpreter::~Interpreter() = default; // presumably out-of-line for incomplete member types
+
+// Writes `data` into the input tensor of `input_node`; throws std::runtime_error if none exists.
+void Interpreter::writeInputTensor(const luci::CircleInput *input_node, const void *data,
+                                   size_t data_size)
+{
+  const auto &inputs = _runtime_module->getInputTensors();
+  const size_t index = input_node->index(); // validated below: operator[] past the end is UB
+  if (index >= inputs.size() || inputs[index] == nullptr)
+    throw std::runtime_error("Cannot find tensor for input node named \"" +
+                             std::string(input_node->name()) + "\".");
+  if (data != nullptr) // a null `data` leaves the tensor untouched
+    inputs[index]->writeData(data, data_size);
+}
+
+// Reads the output tensor of `output_node` into `data`; throws std::runtime_error if none exists.
+void Interpreter::readOutputTensor(const luci::CircleOutput *output_node, void *data,
+                                   size_t data_size)
+{
+  const auto &outputs = _runtime_module->getOutputTensors();
+  const size_t index = output_node->index(); // validated below: operator[] past the end is UB
+  if (index >= outputs.size() || outputs[index] == nullptr)
+    throw std::runtime_error("Cannot find tensor for output node named \"" +
+                             std::string(output_node->name()) + "\".");
+  if (data != nullptr) // a null `data` means there is nowhere to copy to
+    outputs[index]->readData(data, data_size);
+}
+
+void Interpreter::interpret() { _runtime_module->execute(); } // runs the module's main graph
+
+void Interpreter::attachObserver(ExecutionObserver *observer)
+{
+  if (std::count(_observers.cbegin(), _observers.cend(), observer) != 0) // attach only once
+    throw std::runtime_error("Observer is already attached.");
+  _observers.push_back(observer);
+}
+
+ExecutionObserver::~ExecutionObserver() = default;
+// Default event handlers: empty bodies, parameters unnamed because they are unused.
+void ExecutionObserver::postTensorWrite(const luci::CircleNode *, const Tensor *) {}
+
+void ExecutionObserver::preOperatorExecute(const luci::CircleNode *) {}
+
+void ExecutionObserver::postOperatorExecute(const luci::CircleNode *) {}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/SimpleMemoryManager.cpp b/compiler/luci-micro/luci-interpreter/src/SimpleMemoryManager.cpp
new file mode 100644
index 000000000..230e39896
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/SimpleMemoryManager.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci_interpreter/SimpleMemoryManager.h"
+
+namespace luci_interpreter
+{
+
+// Gives `tensor` a freshly heap-allocated buffer sized for its current shape and element type.
+// Non-allocatable tensors are left untouched; an existing buffer is released first.
+void SimpleMemoryManager::allocate_memory(luci_interpreter::Tensor &tensor)
+{
+  if (!tensor.is_allocatable())
+    return;
+
+  // Drop the previous buffer first so that re-allocation does not leak it.
+  if (tensor.is_data_allocated())
+    release_memory(tensor);
+
+  const auto bytes_per_element = getDataTypeSize(tensor.element_type());
+  const auto element_count = tensor.shape().num_elements();
+  // The buffer is freed with delete[] in release_memory().
+  tensor.set_data_buffer(new uint8_t[element_count * bytes_per_element]);
+}
+
+// Frees the buffer attached to `tensor`, if one is allocated, and clears its data pointer.
+void SimpleMemoryManager::release_memory(luci_interpreter::Tensor &tensor)
+{
+  // delete[] pairs with the new uint8_t[] in allocate_memory().
+  if (tensor.is_data_allocated())
+  {
+    auto *owned = tensor.data<uint8_t>();
+    delete[] owned;
+  }
+  tensor.set_data_buffer(nullptr);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/StaticMemoryManager.cpp b/compiler/luci-micro/luci-interpreter/src/StaticMemoryManager.cpp
new file mode 100644
index 000000000..73a819919
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/StaticMemoryManager.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci_interpreter/StaticMemoryManager.h"
+
+namespace luci_interpreter
+{
+
+// Points `tensor` at `_buffer_ptr + tensor.get_offset()`; nothing is allocated here.
+// Non-allocatable tensors are skipped.
+void StaticMemoryManager::allocate_memory(luci_interpreter::Tensor &tensor)
+{
+  if (!tensor.is_allocatable())
+    return;
+
+  const int32_t offset = tensor.get_offset();
+  assert(offset >= 0); // a negative offset would point before the buffer
+  tensor.set_data_buffer(_buffer_ptr + offset);
+}
+
+void StaticMemoryManager::release_memory(luci_interpreter::Tensor &tensor)
+{
+  tensor.set_data_buffer(nullptr); // only detaches the tensor; nothing is freed here
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/TestMemoryManager.cpp b/compiler/luci-micro/luci-interpreter/src/TestMemoryManager.cpp
new file mode 100644
index 000000000..3beeee55c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/TestMemoryManager.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+
+// Allocates a heap buffer for `tensor` and records it in `allocations`.
+// Non-allocatable tensors are skipped; an already-attached buffer is detached first.
+void TestMemoryManager::allocate_memory(luci_interpreter::Tensor &tensor)
+{
+  if (!tensor.is_allocatable())
+    return;
+
+  // release_memory() only clears the pointer; the old buffer stays listed in `allocations`.
+  if (tensor.is_data_allocated())
+    release_memory(tensor);
+
+  const auto bytes_per_element = getDataTypeSize(tensor.element_type());
+  const auto element_count = tensor.shape().num_elements();
+  auto *buffer = new uint8_t[element_count * bytes_per_element];
+  allocations.push_back(buffer);
+  tensor.set_data_buffer(buffer);
+}
+
+void TestMemoryManager::release_memory(luci_interpreter::Tensor &tensor)
+{
+  tensor.set_data_buffer(nullptr); // the buffer stays in `allocations`; it is not freed here
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/core/CMakeLists.txt b/compiler/luci-micro/luci-interpreter/src/core/CMakeLists.txt
new file mode 100644
index 000000000..c2471e01c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/core/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Sources of the static core library (target name comes from LUCI_INTERPRETER_CORE).
+set(SOURCES
+    "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/core/DataType.h"
+    "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/core/Tensor.h"
+    EventNotifier.h
+    Kernel.h
+    KernelParams.h
+    RuntimeGraph.h RuntimeGraph.cpp
+    RuntimeModule.h
+    Tensor.cpp)
+
+add_library(${LUCI_INTERPRETER_CORE} STATIC ${SOURCES})
+# Position-independent code is enabled unless NNCC_LIBRARY_NO_PIC is set.
+if (NOT NNCC_LIBRARY_NO_PIC)
+  set_target_properties(${LUCI_INTERPRETER_CORE} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif ()
+target_include_directories(${LUCI_INTERPRETER_CORE}
+    PUBLIC "${LUCI_INTERPRETER_INCLUDE_DIR}" "${LUCI_INTERPRETER_SOURCE_DIR}")
+target_link_libraries(${LUCI_INTERPRETER_CORE} PUBLIC luci_lang PRIVATE nncc_common)
diff --git a/compiler/luci-micro/luci-interpreter/src/core/EventNotifier.h b/compiler/luci-micro/luci-interpreter/src/core/EventNotifier.h
new file mode 100644
index 000000000..5c4fbd3be
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/core/EventNotifier.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_CORE_EVENTNOTIFIER_H
+#define LUCI_INTERPRETER_CORE_EVENTNOTIFIER_H
+
+namespace luci_interpreter
+{
+class Kernel; // forward declarations: only pointers to these types appear below,
+class Tensor; // so this header does not need their full definitions
+
+// Used at execution stage to tell the interpreter that the runtime state has changed in some way.
+class EventNotifier
+{
+public:
+  virtual ~EventNotifier() = default;
+  virtual void postTensorWrite(const Tensor *tensor) = 0;
+  virtual void preOperatorExecute(const Kernel *kernel) = 0;
+  virtual void postOperatorExecute(const Kernel *kernel) = 0;
+};
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_CORE_EVENTNOTIFIER_H
diff --git a/compiler/luci-micro/luci-interpreter/src/core/Kernel.h b/compiler/luci-micro/luci-interpreter/src/core/Kernel.h
new file mode 100644
index 000000000..a7c4a4218
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/core/Kernel.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_CORE_KERNEL_H
+#define LUCI_INTERPRETER_CORE_KERNEL_H
+
+#include "luci_interpreter/core/Tensor.h"
+
+#include <vector>
+
+namespace luci_interpreter
+{
+
+// Base class for all kernels.
+class Kernel
+{
+protected:
+  Kernel(std::vector<const Tensor *> inputs, std::vector<Tensor *> outputs)
+    : _inputs(std::move(inputs)), _outputs(std::move(outputs))
+  {
+  }
+
+public:
+  virtual ~Kernel() = default;
+
+  const std::vector<const Tensor *> &getInputTensors() const { return _inputs; }
+  const std::vector<Tensor *> &getOutputTensors() const { return _outputs; }
+
+  // Configures the kernel.
+  // RuntimeGraph::execute() calls this right before every execute(), not only once, so it
+  // must be safe to repeat; it is the place for preparing (resizing) output tensors.
+  virtual void configure() = 0;
+
+  // Executes the kernel.
+  virtual void execute() const = 0;
+
+protected:
+  // NOTE Prefer not to use these in derived classes.
+  const std::vector<const Tensor *> _inputs;
+  const std::vector<Tensor *> _outputs;
+};
+
+// Base class for kernels with parameters.
+template <typename Params> class KernelWithParams : public Kernel
+{
+protected:
+  KernelWithParams(std::vector<const Tensor *> inputs, std::vector<Tensor *> outputs,
+                   const Params &params)
+    : Kernel(std::move(inputs), std::move(outputs)), _params(params)
+  {
+  }
+
+public:
+  const Params &params() const { return _params; } // read-only view of the stored copy
+
+protected:
+  const Params _params; // copied from the constructor argument
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_CORE_KERNEL_H
diff --git a/compiler/luci-micro/luci-interpreter/src/core/KernelParams.h b/compiler/luci-micro/luci-interpreter/src/core/KernelParams.h
new file mode 100644
index 000000000..6c0220c62
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/core/KernelParams.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_CORE_KERNELPARAMS_H
+#define LUCI_INTERPRETER_CORE_KERNELPARAMS_H
+
+#include <luci/IR/AttrPadding.h>
+#include <luci/IR/AttrFusedActFunc.h>
+#include <luci/IR/AttrMirrorPadMode.h>
+#include <luci_interpreter/core/DataType.h>
+
+#include <cstdint>
+#include <vector>
+
+namespace luci_interpreter
+{
+
+// Inject commonly used types into `luci_interpreter` namespace for convenience.
+using Activation = luci::FusedActFunc;
+using Padding = luci::Padding;
+using MirrorPadMode = luci::MirrorPadMode;
+
+struct AddParams
+{
+  Activation activation;
+};
+
+struct ArgMaxParams
+{
+  DataType output_type;
+};
+
+struct BatchMatMulParams
+{
+  bool adj_x;
+  bool adj_y;
+};
+
+struct ConcatenationParams
+{
+  int axis;
+  Activation activation;
+};
+
+struct Conv2DParams
+{
+  Padding padding;
+  int32_t stride_height;
+  int32_t stride_width;
+  int32_t dilation_height_factor;
+  int32_t dilation_width_factor;
+  Activation activation;
+};
+
+struct DepthToSpaceParams
+{
+  int block_size;
+};
+
+struct DepthwiseConv2DParams
+{
+  Padding padding;
+  int32_t depth_multiplier; // TODO Remove, as it can be calculated.
+  int32_t stride_height;
+  int32_t stride_width;
+  int32_t dilation_height_factor;
+  int32_t dilation_width_factor;
+  Activation activation;
+};
+
+struct DivParams
+{
+  Activation activation;
+};
+
+struct FullyConnectedParams
+{
+  Activation activation;
+  bool keep_num_dims = false; // the only member in this header with a default initializer
+};
+
+struct GatherParams
+{
+  int32_t axis;
+  int32_t batch_dims;
+};
+
+struct InstanceNormParams
+{
+  float epsilon;
+  Activation activation;
+};
+
+struct L2NormParams
+{
+  Activation activation;
+};
+
+struct LeakyReluParams
+{
+  float alpha;
+};
+
+struct LocalResponseNormalizationParams
+{
+  int32_t radius;
+  float bias;
+  float alpha;
+  float beta;
+};
+
+struct MirrorPadParams
+{
+  MirrorPadMode mode;
+};
+
+struct MulParams
+{
+  Activation activation;
+};
+
+struct OneHotParams
+{
+  int32_t axis;
+};
+
+struct PackParams
+{
+  int32_t values_count;
+  int32_t axis;
+};
+
+struct Pool2DParams
+{
+  Padding padding;
+  int32_t filter_height;
+  int32_t filter_width;
+  int32_t stride_height;
+  int32_t stride_width;
+  Activation activation;
+};
+
+struct ReducerParams
+{
+  bool keep_dims;
+};
+
+struct ResizeBilinearParams
+{
+  bool align_corners;
+  bool half_pixel_centers;
+};
+
+struct ResizeNearestNeighborParams
+{
+  bool align_corners;
+  bool half_pixel_centers;
+};
+
+struct ShapeParams
+{
+  loco::DataType out_type; // NOTE(review): ArgMaxParams spells its type DataType - same type?
+};
+
+struct SubParams
+{
+  Activation activation;
+};
+
+struct SVDFParams
+{
+  bool asymmetric_quantize_inputs;
+  int32_t svdf_rank;
+  Activation activation;
+};
+
+struct SpaceToDepthParams
+{
+  int block_size;
+};
+
+struct SoftmaxParams
+{
+  float beta;
+};
+
+struct StridedSliceParams
+{
+  int32_t begin_mask;
+  int32_t end_mask;
+  int32_t ellipsis_mask;
+  int32_t new_axis_mask;
+  int32_t shrink_axis_mask;
+};
+
+struct SqueezeParams
+{
+  std::vector<int32_t> squeeze_dims;
+};
+
+struct TransposeConvParams
+{
+  Padding padding;
+  int32_t stride_height;
+  int32_t stride_width;
+};
+
+struct UnpackParams
+{
+  int axis;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_CORE_KERNELPARAMS_H
diff --git a/compiler/luci-micro/luci-interpreter/src/core/RuntimeGraph.cpp b/compiler/luci-micro/luci-interpreter/src/core/RuntimeGraph.cpp
new file mode 100644
index 000000000..c2f8d2ea8
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/core/RuntimeGraph.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "core/RuntimeGraph.h"
+
+#include "core/RuntimeModule.h"
+
+#include <algorithm>
+#include <unordered_map>
+
+namespace luci_interpreter
+{
+
+class RuntimeGraph::TensorAllocPlan
+{
+  std::vector<std::vector<Tensor *>> _alloc_plan;   // [kernel] -> tensors allocated before it runs
+  std::vector<std::vector<Tensor *>> _dealloc_plan; // [kernel] -> tensors released after it ran
+  bool _valid = false; // true only between build() and the next invalidate()
+  IMemoryManager *_memory_manager;
+
+public:
+  explicit TensorAllocPlan(IMemoryManager *memory_manager);
+  void invalidate() { _valid = false; }
+  bool isValid() const { return _valid; }
+  void build(const RuntimeGraph &graph);
+  void allocate(size_t kernel_index) const;
+  void deallocate(size_t kernel_index) const;
+};
+
+RuntimeGraph::TensorAllocPlan::TensorAllocPlan(IMemoryManager *memory_manager)
+  : _memory_manager(memory_manager) // kept as a raw pointer; used by allocate()/deallocate()
+{
+}
+
+// Computes, for each tensor produced by a kernel, the kernel that creates it and the last kernel
+// that reads it, then groups tensors by those two indices into the alloc/dealloc plans.
+void RuntimeGraph::TensorAllocPlan::build(const RuntimeGraph &graph)
+{
+  invalidate();
+  using Lifetime = std::pair<size_t, size_t>; // {producer index, last consumer index}
+  std::unordered_map<Tensor *, Lifetime> lifetimes;
+  const size_t num_kernels = graph._kernels.size();
+  // Moves the end of a tracked tensor's lifetime; tensors no kernel produced are ignored.
+  const auto extend_to = [&lifetimes](const Tensor *tensor, size_t last_use) {
+    const auto found = lifetimes.find(const_cast<Tensor *>(tensor));
+    if (found != lifetimes.end())
+      found->second.second = last_use;
+  };
+  for (size_t index = 0; index < num_kernels; ++index)
+  {
+    for (const Tensor *tensor : graph._kernels[index]->getInputTensors())
+      extend_to(tensor, index);
+    for (Tensor *tensor : graph._kernels[index]->getOutputTensors())
+    {
+      assert(lifetimes.count(tensor) == 0);
+      lifetimes[tensor] = Lifetime(index, index);
+    }
+  }
+  // Graph outputs get index num_kernels: the extra dealloc slot that execute() never reaches.
+  for (const Tensor *tensor : graph.getOutputTensors())
+    extend_to(tensor, num_kernels);
+  _alloc_plan.assign(num_kernels, std::vector<Tensor *>());
+  _dealloc_plan.assign(num_kernels + 1, std::vector<Tensor *>());
+  for (const auto &item : lifetimes)
+  {
+    _alloc_plan[item.second.first].push_back(item.first);
+    _dealloc_plan[item.second.second].push_back(item.first);
+  }
+  _valid = true;
+}
+
+// Allocates every tensor whose lifetime starts at kernel `kernel_index`.
+void RuntimeGraph::TensorAllocPlan::allocate(size_t kernel_index) const
+{
+  assert(_valid && kernel_index < _alloc_plan.size());
+  const auto &scheduled = _alloc_plan[kernel_index];
+  for (Tensor *tensor : scheduled)
+    _memory_manager->allocate_memory(*tensor);
+}
+
+// Releases every tensor whose lifetime ends at index `kernel_index`.
+void RuntimeGraph::TensorAllocPlan::deallocate(size_t kernel_index) const
+{
+  assert(_valid && kernel_index < _dealloc_plan.size());
+  const auto &expired = _dealloc_plan[kernel_index];
+  for (Tensor *tensor : expired)
+    _memory_manager->release_memory(*tensor);
+}
+
+RuntimeGraph::RuntimeGraph(RuntimeModule *owning_module, IMemoryManager *memory_manager)
+  : _memory_manager(memory_manager), _owning_module(owning_module), // declaration order
+    _tensor_alloc_plan(std::make_unique<TensorAllocPlan>(memory_manager))
+{
+}
+
+// Hands every still-allocated tensor buffer back to the memory manager.
+// The memory manager is dereferenced here, so it must outlive the graph.
+RuntimeGraph::~RuntimeGraph()
+{
+  for (const std::unique_ptr<Tensor> &owned : _tensors)
+    if (owned->is_data_allocated())
+      _memory_manager->release_memory(*owned);
+}
+
+Tensor *RuntimeGraph::addTensor(std::unique_ptr<Tensor> &&tensor)
+{
+  assert(tensor != nullptr);
+  _tensors.push_back(std::move(tensor)); // the graph now owns the tensor
+  return _tensors.back().get();          // non-owning pointer for the caller
+}
+
+void RuntimeGraph::setInputTensors(const std::vector<Tensor *> &input_tensors)
+{
+  assert(std::all_of(input_tensors.cbegin(), input_tensors.cend(),
+                     [](Tensor *tensor) { return tensor != nullptr; }));
+  _input_tensors = input_tensors; // copies the pointer list; null entries are rejected above
+}
+
+void RuntimeGraph::setOutputTensors(const std::vector<Tensor *> &output_tensors)
+{
+  assert(std::all_of(output_tensors.cbegin(), output_tensors.cend(),
+                     [](Tensor *tensor) { return tensor != nullptr; }));
+  _output_tensors = output_tensors; // copies the pointer list; null entries are rejected above
+}
+
+void RuntimeGraph::configureAllocations(Tensor *tensor)
+{
+  _memory_manager->allocate_memory(*tensor); // immediate allocation, independent of the plan
+}
+
+void RuntimeGraph::addKernel(std::unique_ptr<Kernel> &&kernel)
+{
+  assert(kernel != nullptr);
+  _kernels.push_back(std::move(kernel));
+  _tensor_alloc_plan->invalidate(); // the plan is rebuilt by the next execute()
+}
+
+// Runs every kernel once, in order. Around each kernel: notify observers, configure(),
+// allocate the tensors scheduled for it, execute(), notify again, then release dead tensors.
+void RuntimeGraph::execute() const
+{
+  // addKernel() invalidates the plan, so it is rebuilt here on the first run afterwards.
+  if (!_tensor_alloc_plan->isValid())
+    _tensor_alloc_plan->build(*this);
+
+  EventNotifier *const notifier = _owning_module->getEventNotifier();
+  const bool notify = (notifier != nullptr);
+
+  // Notify the observers that the input tensors have changed.
+  if (notify)
+  {
+    for (const Tensor *input_tensor : getInputTensors())
+    {
+      if (input_tensor->is_observable())
+        notifier->postTensorWrite(input_tensor);
+    }
+  }
+
+  for (size_t index = 0; index < _kernels.size(); ++index)
+  {
+    Kernel *const kernel = _kernels[index].get();
+    if (notify)
+      notifier->preOperatorExecute(kernel);
+
+    // TODO The `configure` method should only be called if the outputs of an operator need to be
+    //  resized.
+    kernel->configure();
+
+    // Preallocate outputs in advance instead of relying on automatic allocation
+    _tensor_alloc_plan->allocate(index);
+    kernel->execute();
+
+    if (notify)
+    {
+      notifier->postOperatorExecute(kernel);
+      for (const Tensor *tensor : kernel->getOutputTensors())
+      {
+        if (tensor->is_observable())
+          notifier->postTensorWrite(tensor);
+      }
+    }
+
+    // Tensors whose last consumer was this kernel are released now.
+    _tensor_alloc_plan->deallocate(index);
+  }
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/core/RuntimeGraph.h b/compiler/luci-micro/luci-interpreter/src/core/RuntimeGraph.h
new file mode 100644
index 000000000..8184e249d
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/core/RuntimeGraph.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_CORE_RUNTIMEGRAPH_H
+#define LUCI_INTERPRETER_CORE_RUNTIMEGRAPH_H
+
+#include "luci_interpreter/core/Tensor.h"
+#include "luci_interpreter/MemoryManager.h"
+#include "core/Kernel.h"
+
+#include <memory>
+#include <vector>
+
+namespace luci_interpreter
+{
+
+class RuntimeModule;
+
+class RuntimeGraph
+{
+private:
+  class TensorAllocPlan;
+  friend class TensorAllocPlan;
+
+public:
+  explicit RuntimeGraph(RuntimeModule *owning_module, IMemoryManager *memory_manager);
+  ~RuntimeGraph();
+  // Takes ownership of the tensor; the returned pointer is non-owning.
+  Tensor *addTensor(std::unique_ptr<Tensor> &&tensor);
+  // Both setters copy the given pointer lists; null entries are rejected by assert.
+  void setInputTensors(const std::vector<Tensor *> &input_tensors);
+  void setOutputTensors(const std::vector<Tensor *> &output_tensors);
+  // Asks the memory manager to allocate `tensor` immediately.
+  void configureAllocations(Tensor *tensor);
+
+  const std::vector<Tensor *> &getInputTensors() const { return _input_tensors; }
+  const std::vector<Tensor *> &getOutputTensors() const { return _output_tensors; }
+  // Appends a kernel to the execution order and invalidates the allocation plan.
+  void addKernel(std::unique_ptr<Kernel> &&kernel);
+  // Runs every kernel in order, rebuilding the allocation plan first if needed.
+  void execute() const;
+
+private:
+  IMemoryManager *_memory_manager; // not owned
+  RuntimeModule *_owning_module;   // not owned
+  std::vector<std::unique_ptr<Tensor>> _tensors; // tensors added through addTensor()
+  std::vector<Tensor *> _input_tensors;
+  std::vector<Tensor *> _output_tensors;
+
+  // Kernels in execution order.
+  std::vector<std::unique_ptr<Kernel>> _kernels;
+  // Per-kernel schedule: tensors to allocate before, and to release after, each op
+  std::unique_ptr<TensorAllocPlan> _tensor_alloc_plan;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_CORE_RUNTIMEGRAPH_H
diff --git a/compiler/luci-micro/luci-interpreter/src/core/RuntimeModule.h b/compiler/luci-micro/luci-interpreter/src/core/RuntimeModule.h
new file mode 100644
index 000000000..78873b0ec
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/core/RuntimeModule.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_CORE_RUNTIMEMODULE_H
+#define LUCI_INTERPRETER_CORE_RUNTIMEMODULE_H
+
+#include "core/RuntimeGraph.h"
+#include "core/EventNotifier.h"
+#include "luci_interpreter/MemoryManager.h"
+
+#include <memory>
+#include <vector>
+
+namespace luci_interpreter
+{
+
+// Holds the runtime graphs of one module; the first graph added serves as the main graph.
+class RuntimeModule
+{
+public:
+  explicit RuntimeModule(EventNotifier *event_notifier) : _event_notifier(event_notifier) {}
+  EventNotifier *getEventNotifier() const { return _event_notifier; }
+  // Creates a graph stored in this module and returns a raw pointer to it.
+  RuntimeGraph *addGraph(IMemoryManager *memory_manager)
+  {
+    auto graph = std::make_unique<RuntimeGraph>(this, memory_manager);
+    _graphs.push_back(std::move(graph));
+    return _graphs.back().get();
+  }
+
+  // The tensor accessors and execute() forward to the main graph.
+  const std::vector<Tensor *> &getInputTensors() const { return getMainGraph()->getInputTensors(); }
+  const std::vector<Tensor *> &getOutputTensors() const
+  {
+    return getMainGraph()->getOutputTensors();
+  }
+  void execute() const { getMainGraph()->execute(); }
+
+private:
+  RuntimeGraph *getMainGraph() const { return _graphs.front().get(); }
+  EventNotifier *const _event_notifier;
+  std::vector<std::unique_ptr<RuntimeGraph>> _graphs;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_CORE_RUNTIMEMODULE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/core/Tensor.cpp b/compiler/luci-micro/luci-interpreter/src/core/Tensor.cpp
new file mode 100644
index 000000000..3c3c5ffff
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/core/Tensor.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci_interpreter/core/Tensor.h"
+
+#include <cstring>
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+Tensor::Tensor(DataType element_type, Shape shape, AffineQuantization quantization,
+               std::string name) // shape, quantization and name are taken by value and moved
+  : _element_type(element_type), _shape(std::move(shape)), _quantization(std::move(quantization)),
+    _name(std::move(name)), _data_allocated(false) // _data_allocated starts out false
+{
+}
+
+// Copies the tensor contents to data_ptr; data_size must equal the tensor's size in bytes.
+void Tensor::readData(void *data_ptr, size_t data_size) const
+{
+  const size_t bytes_per_element = getDataTypeSize(element_type());
+  const int32_t element_count = shape().num_elements();
+  const size_t expected_size = element_count * bytes_per_element;
+  if (expected_size != data_size)
+    throw std::invalid_argument("Invalid data size.");
+  assert(data_ptr != nullptr);
+  std::memcpy(data_ptr, data<void>(), data_size);
+}
+
+// Overwrites the tensor contents from data_ptr; data_size must equal the tensor's size in bytes.
+void Tensor::writeData(const void *data_ptr, size_t data_size)
+{
+  const size_t bytes_per_element = getDataTypeSize(element_type());
+  const int32_t element_count = shape().num_elements();
+  const size_t expected_size = element_count * bytes_per_element;
+  if (expected_size != data_size)
+    throw std::invalid_argument("Invalid data size.");
+  assert(data_ptr != nullptr);
+  std::memcpy(data<void>(), data_ptr, data_size);
+}
+
+void Tensor::resize(const Shape &new_shape) { _shape = new_shape; } // only _shape is assigned here
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/import/CMakeLists.txt b/compiler/luci-micro/luci-interpreter/src/import/CMakeLists.txt
new file mode 100644
index 000000000..dd9733f92
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/import/CMakeLists.txt
@@ -0,0 +1,15 @@
+set(SOURCES
+    "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/GraphBuilderRegistry.h"
+    GraphBuilderRegistry.cpp)
+
+# add every file found under Nodes/ (the specific node builders) to SOURCES
+file(GLOB_RECURSE NODES "Nodes/*")
+list(APPEND SOURCES ${NODES})
+
+add_library(${LUCI_INTERPRETER_IMPORT} STATIC ${SOURCES})
+if (NOT NNCC_LIBRARY_NO_PIC) # position-independent code unless NNCC_LIBRARY_NO_PIC is set
+  set_target_properties(${LUCI_INTERPRETER_IMPORT} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif(NOT NNCC_LIBRARY_NO_PIC)
+
+target_include_directories(${LUCI_INTERPRETER_IMPORT} PUBLIC "${LUCI_INTERPRETER_INCLUDE_DIR}")
+target_link_libraries(${LUCI_INTERPRETER_IMPORT} PUBLIC luci_import)
diff --git a/compiler/luci-micro/luci-interpreter/src/import/GraphBuilderRegistry.cpp b/compiler/luci-micro/luci-interpreter/src/import/GraphBuilderRegistry.cpp
new file mode 100644
index 000000000..a33bca6a4
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/import/GraphBuilderRegistry.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "luci_interpreter/GraphBuilderRegistry.h"
+#include "Nodes/CircleReferencingConst.h"
+
+namespace luci_interpreter
+{
+
+// Returns a GraphBuilderRegistry with CircleReferencingConstNodeBuilder added to it.
+std::unique_ptr<luci::GraphBuilderSource> source_without_constant_copying()
+{
+  auto registry = std::make_unique<luci::GraphBuilderRegistry>();
+
+  // redefine NodeBuilder of BUFFER type
+  registry->add(std::make_unique<CircleReferencingConstNodeBuilder>());
+
+  return registry;
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/import/Nodes/CircleReferencingConst.cpp b/compiler/luci-micro/luci-interpreter/src/import/Nodes/CircleReferencingConst.cpp
new file mode 100644
index 000000000..14e90f240
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/import/Nodes/CircleReferencingConst.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleReferencingConst.h"
+
+#include <vector>
+
+namespace
+{
+
+// helper struct which describes data loaded to custom_options of CircleReferencingConst node
+struct ConstDataReference
+{
+  const uint8_t *data = nullptr; // set from buffer.data() by the node builder in this file
+  uint32_t size = 0;             // set from buffer.size(); presumably a byte count (confirm)
+};
+
+} // namespace
+
+namespace luci_interpreter
+{
+using namespace luci;
+
+CircleNode *CircleReferencingConstNodeBuilder::build(TensorIndex tensor_index,
+                                                     GraphBuilderContext *context) const
+{
+  assert(tensor_index >= 0);
+  // Returns the CircleCustomOut of a new "CircleReferencingConst" custom node, or nullptr on skip.
+  const auto graph = context->graph();
+  const auto reader = context->reader();
+  const auto tensors = reader->tensors();
+  auto const const_tensor = tensors[tensor_index];
+  assert(const_tensor != nullptr);
+  if (const_tensor->is_variable())
+  {
+    // variable tensor: no referencing const node is built for it here
+    return nullptr;
+  }
+
+  auto const buffer = wrap(reader->buffers()[const_tensor->buffer()]->data());
+  auto const const_dims = wrap(const_tensor->shape()); // in NHWC
+  if (const_dims.empty() && buffer.empty())
+  {
+    // no dimensions and no data (unknown shape tensor or scalar tensor): nothing to reference
+    return nullptr;
+  }
+
+  // if tensor_index is used as output to some other operator, this is not a constant
+  auto tensoroutputs = context->tensoroutputs();
+  if (tensoroutputs->find(tensor_index))
+  {
+    // other operator output tensor
+    return nullptr;
+  }
+
+  uint32_t num_elements = 1; // product of all dimensions
+  for (uint32_t r = 0; r < const_dims.size(); ++r)
+  {
+    num_elements = num_elements * const_dims[r];
+  }
+
+  if (buffer.empty() && num_elements > 0)
+  {
+    // the shape has elements but the buffer holds no data: nothing to reference
+    return nullptr;
+  }
+
+  // create CircleReferencingConst
+  auto custom_node = graph->nodes()->create<CircleCustom>(0, 1);
+  {
+    custom_node->custom_code("CircleReferencingConst");
+
+    copy_tensor_attributes(const_tensor, custom_node);
+    custom_node->shape_status(luci::ShapeStatus::VALID);
+    // NOTE(review): only the address is stored; the buffer presumably must outlive the node.
+    // custom options hold a ConstDataReference: the buffer's data pointer and its size
+    {
+      std::vector<uint8_t> custom_options(sizeof(ConstDataReference));
+      {
+        auto &const_data_ref = *reinterpret_cast<ConstDataReference *>(custom_options.data());
+        const_data_ref = {buffer.data(), buffer.size()};
+      }
+      custom_node->custom_options(custom_options);
+    }
+  }
+
+  // Output of the CircleCustom node is represented by a CircleCustomOut node
+  auto out_node = graph->nodes()->create<CircleCustomOut>();
+  {
+    out_node->index(0);
+    out_node->input(custom_node);
+
+    copy_tensor_attributes(const_tensor, out_node);
+    out_node->shape_status(luci::ShapeStatus::VALID);
+  }
+
+  return out_node;
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/import/Nodes/CircleReferencingConst.h b/compiler/luci-micro/luci-interpreter/src/import/Nodes/CircleReferencingConst.h
new file mode 100644
index 000000000..ed8f95124
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/import/Nodes/CircleReferencingConst.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_INTERPRETER_IMPORT_OP_CIRCLE_REFERENCING_CONST_H__
+#define __LUCI_INTERPRETER_IMPORT_OP_CIRCLE_REFERENCING_CONST_H__
+
+#include <luci/Import/NodeBuilder.h>
+
+#include <luci/IR/Nodes/CircleConst.h>
+
+namespace luci_interpreter
+{
+using namespace luci;
+
+/**
+ * @brief Builder creating a CircleCustom node that points to the constant data of a tensor buffer.
+ */
+class CircleReferencingConstNodeBuilder : public TypedNodeBuilder<NodeBuilderType::BUFFER>
+{
+public:
+  CircleNode *build(TensorIndex tensor_index, GraphBuilderContext *ctx) const final;
+};
+
+} // namespace luci_interpreter
+
+#endif // __LUCI_INTERPRETER_IMPORT_OP_CIRCLE_REFERENCING_CONST_H__
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Add.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Add.cpp
new file mode 100644
index 000000000..d7bf3084f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Add.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Add.h"
+
+#include "kernels/BinaryOpCommon.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/add.h>
+#include <tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Add::Add(const Tensor *input1, const Tensor *input2, Tensor *output, const AddParams &params)
+  : KernelWithParams<AddParams>({input1, input2}, {output}, params) // two inputs, one output
+{
+}
+
+void Add::configure()
+{ // Checks element types (and S16 zero points), then resizes the output to the broadcast shape.
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type());
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type());
+  if (input1()->element_type() == DataType::S16)
+  { // S16: each input has exactly one zero point, and all three zero points are 0
+    LUCI_INTERPRETER_CHECK(input1()->zero_points().size() == 1 &&
+                           input2()->zero_points().size() == 1);
+    LUCI_INTERPRETER_CHECK(input1()->zero_point() == 0 && input2()->zero_point() == 0 &&
+                           output()->zero_point() == 0);
+  }
+  // The output shape comes from calculateShapeForBroadcast applied to both input shapes.
+  output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+// Runs the evaluation routine that matches the element type of the first input.
+// Element types without a dedicated routine are rejected with std::runtime_error.
+void Add::execute() const
+{
+  const auto type = input1()->element_type();
+
+  // types handled by the float / integer routines
+  if (type == DataType::FLOAT32)
+    return evalFloat();
+  if (type == DataType::S64)
+    return evalInteger<int64_t>();
+  if (type == DataType::S32)
+    return evalInteger<int32_t>();
+
+  // types handled by the quantized routines
+  if (type == DataType::U8)
+    return evalQuantized();
+  if (type == DataType::S16)
+    return evalQuantizedS16();
+
+  // anything else
+  throw std::runtime_error("Unsupported type.");
+}
+
+// Float path: runs the tflite reference Add, switching to the 4D broadcast variant when
+// ProcessBroadcastShapes reports that the two input shapes need broadcasting.
+void Add::evalFloat() const
+{
+  tflite::ArithmeticParams params{};
+  fillArithmeticActivationRange<float>(params, _params.activation);
+
+  const auto lhs_shape = getTensorShape(input1());
+  const auto rhs_shape = getTensorShape(input2());
+  const auto out_shape = getTensorShape(output());
+  const auto *lhs = getTensorData<float>(input1());
+  const auto *rhs = getTensorData<float>(input2());
+  auto *out = getTensorData<float>(output());
+
+  // params is handed over by pointer, so ProcessBroadcastShapes may update it
+  if (tflite::reference_ops::ProcessBroadcastShapes(lhs_shape, rhs_shape, &params))
+    tflite::reference_ops::BroadcastAdd4DSlow(params, lhs_shape, lhs, rhs_shape, rhs,
+                                              out_shape, out);
+  else
+    tflite::reference_ops::Add(params, lhs_shape, lhs, rhs_shape, rhs, out_shape, out);
+}
+
+// Integer path for element type T: runs the tflite reference Add, switching to the 4D
+// broadcast variant when ProcessBroadcastShapes reports that broadcasting is needed.
+template <typename T> void Add::evalInteger() const
+{
+  tflite::ArithmeticParams params{};
+  fillArithmeticActivationRange<T>(params, _params.activation);
+
+  const auto lhs_shape = getTensorShape(input1());
+  const auto rhs_shape = getTensorShape(input2());
+  const auto out_shape = getTensorShape(output());
+  const auto *lhs = getTensorData<T>(input1());
+  const auto *rhs = getTensorData<T>(input2());
+  auto *out = getTensorData<T>(output());
+
+  // params is handed over by pointer, so ProcessBroadcastShapes may update it
+  if (tflite::reference_ops::ProcessBroadcastShapes(lhs_shape, rhs_shape, &params))
+    tflite::reference_ops::BroadcastAdd4DSlow(params, lhs_shape, lhs, rhs_shape, rhs,
+                                              out_shape, out);
+  else
+    tflite::reference_ops::Add(params, lhs_shape, lhs, rhs_shape, rhs, out_shape, out);
+}
+
+void Add::evalQuantized() const
+{ // U8 path: fills tflite::ArithmeticParams for quantized addition and runs the reference Add
+  const auto input1_scale = static_cast<double>(input1()->scale());
+  const auto input2_scale = static_cast<double>(input2()->scale());
+  const auto output_scale = static_cast<double>(output()->scale());
+  // Each input scale is divided by twice the larger input scale to get its real multiplier.
+  const int left_shift = 20;
+  const double twice_max_input_scale = 2 * std::max(input1_scale, input2_scale);
+  const double real_input1_multiplier = input1_scale / twice_max_input_scale;
+  const double real_input2_multiplier = input2_scale / twice_max_input_scale;
+  const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * output_scale);
+  // Convert the real multipliers into the integer multiplier/shift pairs stored in params below.
+  int32_t input1_multiplier{}, input2_multiplier{}, output_multiplier{};
+  int input1_shift{}, input2_shift{}, output_shift{};
+  quantizeMultiplierSmallerThanOneExp(real_input1_multiplier, &input1_multiplier, &input1_shift);
+  quantizeMultiplierSmallerThanOneExp(real_input2_multiplier, &input2_multiplier, &input2_shift);
+  quantizeMultiplierSmallerThanOneExp(real_output_multiplier, &output_multiplier, &output_shift);
+
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+
+  tflite::ArithmeticParams params{};
+  params.left_shift = left_shift;
+  // The kernel expects inputs' zero points to be negated.
+  params.input1_offset = -input1()->zero_point(); // Note the '-'.
+  params.input1_multiplier = input1_multiplier;
+  params.input1_shift = input1_shift;
+  params.input2_offset = -input2()->zero_point(); // Note the '-'.
+  params.input2_multiplier = input2_multiplier;
+  params.input2_shift = input2_shift;
+  params.output_offset = output()->zero_point(); // the output zero point is not negated
+  params.output_multiplier = output_multiplier;
+  params.output_shift = output_shift;
+  params.quantized_activation_min = activation_min;
+  params.quantized_activation_max = activation_max;
+
+  const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
+    getTensorShape(input1()), getTensorShape(input2()), &params);
+
+  if (need_broadcast)
+  {
+    tflite::reference_ops::BroadcastAdd4DSlow(
+      params, getTensorShape(input1()), getTensorData<uint8_t>(input1()), getTensorShape(input2()),
+      getTensorData<uint8_t>(input2()), getTensorShape(output()), getTensorData<uint8_t>(output()));
+  }
+  else
+  {
+    tflite::reference_ops::Add(params, getTensorShape(input1()), getTensorData<uint8_t>(input1()),
+                               getTensorShape(input2()), getTensorData<uint8_t>(input2()),
+                               getTensorShape(output()), getTensorData<uint8_t>(output()));
+  }
+}
+
+// S16 path: broadcasted element-wise add with fixed-point rescaling and activation clamping.
+void Add::evalQuantizedS16() const
+{
+  const auto input1_scale = static_cast<double>(input1()->scale());
+  const auto input2_scale = static_cast<double>(input2()->scale());
+  const auto output_scale = static_cast<double>(output()->scale());
+  constexpr int left_shift = 12;
+  const double twice_max_input_scale = 2 * std::max(input1_scale, input2_scale);
+  const double real_input1_multiplier = input1_scale / twice_max_input_scale;
+  const double real_input2_multiplier = input2_scale / twice_max_input_scale;
+  const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * output_scale);
+
+  int32_t input1_multiplier{}, input2_multiplier{}, output_multiplier{};
+  int input1_shift{}, input2_shift{}, output_shift{};
+  quantizeMultiplierSmallerThanOneExp(real_input1_multiplier, &input1_multiplier, &input1_shift);
+  quantizeMultiplierSmallerThanOneExp(real_input2_multiplier, &input2_multiplier, &input2_shift);
+  quantizeMultiplierSmallerThanOneExp(real_output_multiplier, &output_multiplier, &output_shift);
+
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+  // '* (1 << left_shift)' instead of '<<': left-shifting a negative value is UB before C++20.
+  auto fn = [input1_multiplier, input1_shift, //
+             input2_multiplier, input2_shift, //
+             output_multiplier, output_shift, //
+             activation_min, activation_max](int16_t input1_val, int16_t input2_val) {
+    const int32_t shifted_input1_val = static_cast<int32_t>(input1_val) * (1 << left_shift);
+    const int32_t shifted_input2_val = static_cast<int32_t>(input2_val) * (1 << left_shift);
+    const int32_t scaled_input1_val = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
+      shifted_input1_val, input1_multiplier, input1_shift);
+    const int32_t scaled_input2_val = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
+      shifted_input2_val, input2_multiplier, input2_shift);
+    const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32_t raw_output = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
+      raw_sum, output_multiplier, output_shift);
+    const int32_t clamped_output = std::min(activation_max, std::max(activation_min, raw_output));
+    return static_cast<int16_t>(clamped_output);
+  };
+
+  BinaryOpBroadcastSlow(getTensorShape(input1()), getTensorData<int16_t>(input1()),
+                        getTensorShape(input2()), getTensorData<int16_t>(input2()),
+                        getTensorShape(output()), getTensorData<int16_t>(output()), fn);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Add.h b/compiler/luci-micro/luci-interpreter/src/kernels/Add.h
new file mode 100644
index 000000000..91d95b6af
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Add.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_ADD_H
+#define LUCI_INTERPRETER_KERNELS_ADD_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Add : public KernelWithParams<AddParams> // addition kernel with two inputs and one output
+{
+public:
+  Add(const Tensor *input1, const Tensor *input2, Tensor *output, const AddParams &params);
+
+  const Tensor *input1() const { return _inputs[0]; }
+  const Tensor *input2() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;     // checks element types and resizes the output (see Add.cpp)
+  void execute() const override; // dispatches on the element type of input1 (see Add.cpp)
+
+private:
+  void evalFloat() const;                         // used for DataType::FLOAT32
+  template <typename T> void evalInteger() const; // used for DataType::S32 and DataType::S64
+  void evalQuantized() const;                     // used for DataType::U8
+  void evalQuantizedS16() const;                  // used for DataType::S16
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_ADD_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Add.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Add.test.cpp
new file mode 100644
index 000000000..b8b1c3089
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Add.test.cpp
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Add.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class AddTest : public ::testing::Test // fixture that provides a TestMemoryManager to the tests
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager; // created in SetUp()
+};
+
+// for quantized Add, the error shouldn't exceed step
+float GetTolerance(float min, float max)
+{
+  float kQuantizedStep = (max - min) / 255.0;
+  return kQuantizedStep;
+}
+
+TEST_F(AddTest, Uint8) // quantized U8 Add over four broadcast shapes, in both input orders
+{
+  std::initializer_list<int32_t> base_shape = {2, 3, 1, 2};
+  std::initializer_list<float> base_data = {-0.3f, 2.3f, 0.9f,  0.5f, 0.8f, -1.1f,
+                                            1.2f,  2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
+  std::initializer_list<int32_t> test_shapes[] = {
+    {1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  std::initializer_list<float> test_data = {0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
+  std::initializer_list<int32_t> output_shapes[] = {
+    {2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
+  std::vector<std::vector<float>> output_data = {
+    {-0.1f, 2.6f,  -0.7f, 2.8f,  0.7f,  3.0f,  1.1f, 0.8f,  0.5f, 1.0f,  1.9f, 1.4f,
+     1.0f,  -0.8f, 0.4f,  -0.6f, 1.8f,  -0.2f, 1.4f, 3.0f,  0.8f, 3.0f,  2.2f, 3.0f,
+     -1.4f, 0.3f,  -2.0f, 0.5f,  -0.6f, 0.9f,  0.9f, -1.9f, 0.3f, -1.7f, 1.7f, -1.3f},
+    {-0.1f, 2.6f, 0.5f, 1.0f, 1.8f, -0.2f, 1.4f, 3.0f, -2.0f, 0.5f, 1.7f, -1.3f},
+    {-0.1f, 2.5f,  0.0f,  2.6f,  -0.7f, 1.9f,  1.1f, 0.7f,  1.2f, 0.8f,  0.5f, 0.1f,
+     1.0f,  -0.9f, 1.1f,  -0.8f, 0.4f,  -1.5f, 1.7f, 3.0f,  2.2f, 3.0f,  2.1f, 3.0f,
+     -1.1f, 0.5f,  -0.6f, 1.0f,  -0.7f, 0.9f,  1.2f, -1.7f, 1.7f, -1.2f, 1.6f, -1.3f},
+    {-0.1f, 2.5f, 1.2f, 0.8f, 0.4f, -1.5f, 1.7f, 3.0f, -0.6f, 1.0f, 1.6f, -1.3f}};
+  float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-3.f, 3.f);
+  for (size_t i = 0; i < output_data.size(); i++) // size_t matches the type of size()
+  {
+    Tensor input1_tensor = makeInputTensor<DataType::U8>(
+      base_shape, quant_param.first, quant_param.second, base_data, _memory_manager.get());
+    Tensor input2_tensor = makeInputTensor<DataType::U8>(
+      test_shapes[i], quant_param.first, quant_param.second, test_data, _memory_manager.get());
+    Tensor output_tensor =
+      makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
+
+    AddParams params{};
+    params.activation = Activation::NONE;
+
+    Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(output_tensor); // allocate after configure() sized the output
+    kernel.execute();
+
+    EXPECT_THAT(dequantizeTensorData(output_tensor),
+                FloatArrayNear(output_data[i], kQuantizedTolerance));
+    EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shapes[i]));
+  }
+  // Re-run with exchanged inputs; the same outputs and shapes are expected.
+  for (size_t i = 0; i < output_data.size(); i++)
+  {
+    Tensor input1_tensor = makeInputTensor<DataType::U8>(
+      test_shapes[i], quant_param.first, quant_param.second, test_data, _memory_manager.get());
+    Tensor input2_tensor = makeInputTensor<DataType::U8>(
+      base_shape, quant_param.first, quant_param.second, base_data, _memory_manager.get());
+    Tensor output_tensor =
+      makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
+
+    AddParams params{};
+    params.activation = Activation::NONE;
+
+    Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
+    kernel.execute();
+
+    EXPECT_THAT(dequantizeTensorData(output_tensor),
+                FloatArrayNear(output_data[i], kQuantizedTolerance));
+    EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shapes[i]));
+  }
+}
+
+TEST_F(AddTest, Float) // FLOAT32 Add with RELU over four broadcast shapes, in both input orders
+{
+  Shape base_shape = {2, 3, 1, 2};
+  std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  std::vector<std::vector<float>> test_outputs = {
+    {0.0f, 2.6f, 0.0f, 2.8f, 0.7f, 3.2f, 1.1f, 0.8f, 0.5f, 1.0f, 1.9f, 1.4f,
+     1.0f, 0.0f, 0.4f, 0.0f, 1.8f, 0.0f, 1.4f, 3.1f, 0.8f, 3.3f, 2.2f, 3.7f,
+     0.0f, 0.3f, 0.0f, 0.5f, 0.0f, 0.9f, 0.9f, 0.0f, 0.3f, 0.0f, 1.7f, 0.0f},
+    {0.0f, 2.6f, 0.5f, 1.0f, 1.8f, 0.0f, 1.4f, 3.1f, 0.0f, 0.5f, 1.7f, 0.0f},
+    {0.0f, 2.5f, 0.0f, 2.6f, 0.0f, 1.9f, 1.1f, 0.7f, 1.2f, 0.8f, 0.5f, 0.1f,
+     1.0f, 0.0f, 1.1f, 0.0f, 0.4f, 0.0f, 1.7f, 3.3f, 2.2f, 3.8f, 2.1f, 3.7f,
+     0.0f, 0.5f, 0.0f, 1.0f, 0.0f, 0.9f, 1.2f, 0.0f, 1.7f, 0.0f, 1.6f, 0.0f},
+    {0.0f, 2.5f, 1.2f, 0.8f, 0.4f, 0.0f, 1.7f, 3.3f, 0.0f, 1.0f, 1.6f, 0.0f}};
+  std::vector<float> input1_data{-0.3f, 2.3f, 0.9f,  0.5f, 0.8f, -1.1f,
+                                 1.2f,  2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
+  std::vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
+  for (size_t i = 0; i < test_shapes.size(); ++i)
+  {
+    Tensor input1_tensor =
+      makeInputTensor<DataType::FLOAT32>(base_shape, input1_data, _memory_manager.get());
+    Tensor input2_tensor =
+      makeInputTensor<DataType::FLOAT32>(test_shapes[i], input2_data, _memory_manager.get());
+    Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+    AddParams params{};
+    params.activation = Activation::RELU; // negative sums appear as 0.0f in test_outputs
+
+    Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(output_tensor); // allocate after configure() sized the output
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
+      << "With shape number " << i;
+  }
+  // Re-run with exchanged inputs; the same outputs are expected.
+  for (size_t i = 0; i < test_shapes.size(); ++i)
+  {
+    Tensor input1_tensor =
+      makeInputTensor<DataType::FLOAT32>(test_shapes[i], input2_data, _memory_manager.get());
+    Tensor input2_tensor =
+      makeInputTensor<DataType::FLOAT32>(base_shape, input1_data, _memory_manager.get());
+    Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+    AddParams params{};
+    params.activation = Activation::RELU;
+
+    Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
+      << "With shape number " << i;
+  }
+}
+
+template <loco::DataType DType> void CheckInteger(luci_interpreter::IMemoryManager *memory_manager)
+{ // Shared body of the S32/S64 tests: RELU Add over four broadcast shapes, in both input orders.
+  using dtype = typename loco::DataTypeImpl<DType>::Type;
+  Shape base_shape = {2, 3, 1, 2};
+  std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  std::vector<std::vector<dtype>> test_outputs = {
+    {3, 3, 0, 1, 0, 8, 5,  1, 0, 0, 2, 6, 8,  0, 1, 0, 5, 1,
+     5, 4, 0, 2, 2, 9, 11, 0, 4, 0, 8, 5, 11, 2, 4, 0, 8, 7},
+    {3, 3, 0, 0, 5, 1, 5, 4, 4, 0, 8, 7},
+    {3, 6, 0, 3, 0, 0, 5, 4, 2, 1, 0,  0, 8, 0, 5, 0, 1,  0,
+     0, 2, 2, 4, 7, 9, 6, 0, 8, 0, 13, 5, 6, 0, 8, 2, 13, 7},
+    {3, 6, 2, 1, 1, 0, 0, 2, 8, 0, 13, 7}};
+  std::vector<dtype> input1_data{-1, 2, 1, 0, 4, -5, 1, 3, 7, -1, 7, 1};
+  std::vector<dtype> input2_data{4, 1, -3, -1, 1, 6};
+  for (size_t i = 0; i < test_shapes.size(); ++i)
+  {
+    Tensor input1_tensor = makeInputTensor<DType>(base_shape, input1_data, memory_manager);
+    Tensor input2_tensor = makeInputTensor<DType>(test_shapes[i], input2_data, memory_manager);
+    Tensor output_tensor = makeOutputTensor(DType);
+
+    AddParams params{};
+    params.activation = Activation::RELU; // negative sums appear as 0 in test_outputs
+
+    Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    memory_manager->allocate_memory(output_tensor); // allocate after configure() sized the output
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorData<dtype>(output_tensor), test_outputs[i])
+      << "With shape number " << i;
+  }
+  // Re-run with exchanged inputs; the same outputs are expected.
+  for (size_t i = 0; i < test_shapes.size(); ++i)
+  {
+    Tensor input1_tensor = makeInputTensor<DType>(test_shapes[i], input2_data, memory_manager);
+    Tensor input2_tensor = makeInputTensor<DType>(base_shape, input1_data, memory_manager);
+    Tensor output_tensor = makeOutputTensor(DType);
+
+    AddParams params{};
+    params.activation = Activation::RELU;
+
+    Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    memory_manager->allocate_memory(output_tensor);
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorData<dtype>(output_tensor), test_outputs[i])
+      << "With shape number " << i;
+  }
+}
+
+TEST_F(AddTest, SInt32)
+{
+  CheckInteger<loco::DataType::S32>(_memory_manager.get()); // expectations live in CheckInteger
+  SUCCEED();
+}
+
+TEST_F(AddTest, SInt64)
+{
+  CheckInteger<loco::DataType::S64>(_memory_manager.get()); // expectations live in CheckInteger
+  SUCCEED();
+}
+
+TEST_F(AddTest, SInt16)
+{
+  Shape base_shape = {2, 3, 1, 2};
+  std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  std::vector<std::vector<int32_t>> ref_output_shapes{
+    {2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
+  // Real-valued inputs and references; the S16 tensors below are built with explicit scales.
+  std::vector<float> input1_data{-0.3f, 2.3f, 0.9f,  0.5f, 0.8f, -1.1f,
+                                 1.2f,  2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
+  std::vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
+  std::vector<std::vector<float>> ref_outputs = {
+    {0.0f, 2.6f, 0.0f, 2.8f, 0.7f, 3.2f, 1.1f, 0.8f, 0.5f, 1.0f, 1.9f, 1.4f,
+     1.0f, 0.0f, 0.4f, 0.0f, 1.8f, 0.0f, 1.4f, 3.1f, 0.8f, 3.3f, 2.2f, 3.7f,
+     0.0f, 0.3f, 0.0f, 0.5f, 0.0f, 0.9f, 0.9f, 0.0f, 0.3f, 0.0f, 1.7f, 0.0f},
+    {0.0f, 2.6f, 0.5f, 1.0f, 1.8f, 0.0f, 1.4f, 3.1f, 0.0f, 0.5f, 1.7f, 0.0f},
+    {0.0f, 2.5f, 0.0f, 2.6f, 0.0f, 1.9f, 1.1f, 0.7f, 1.2f, 0.8f, 0.5f, 0.1f,
+     1.0f, 0.0f, 1.1f, 0.0f, 0.4f, 0.0f, 1.7f, 3.3f, 2.2f, 3.8f, 2.1f, 3.7f,
+     0.0f, 0.5f, 0.0f, 1.0f, 0.0f, 0.9f, 1.2f, 0.0f, 1.7f, 0.0f, 1.6f, 0.0f},
+    {0.0f, 2.5f, 1.2f, 0.8f, 0.4f, 0.0f, 1.7f, 3.3f, 0.0f, 1.0f, 1.6f, 0.0f}};
+
+  for (size_t i = 0; i < test_shapes.size(); ++i)
+  {
+    Tensor input1_tensor = makeInputTensor<DataType::S16>(base_shape, 3.0 / 32767, 0, input1_data,
+                                                          _memory_manager.get());
+    Tensor input2_tensor = makeInputTensor<DataType::S16>(test_shapes[i], 1.0 / 32767, 0,
+                                                          input2_data, _memory_manager.get());
+    Tensor output_tensor = makeOutputTensor(DataType::S16, 4.0 / 32767, 0);
+    const float tolerance = output_tensor.scale(); // allowed error: one output scale unit
+
+    AddParams params{};
+    params.activation = Activation::RELU;
+
+    Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorShape(output_tensor),
+                ::testing::ElementsAreArray(ref_output_shapes[i]))
+      << "With shape number " << i;
+    EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_outputs[i], tolerance))
+      << "With shape number " << i;
+  }
+  // Re-run with exchanged inputs and different scales.
+  for (size_t i = 0; i < test_shapes.size(); ++i)
+  {
+    Tensor input1_tensor = makeInputTensor<DataType::S16>(test_shapes[i], 2.0 / 32767, 0,
+                                                          input2_data, _memory_manager.get());
+    Tensor input2_tensor = makeInputTensor<DataType::S16>(base_shape, 4.0 / 32767, 0, input1_data,
+                                                          _memory_manager.get());
+    Tensor output_tensor = makeOutputTensor(DataType::S16, 5.0 / 32767, 0);
+    const float tolerance = output_tensor.scale(); // allowed error: one output scale unit
+
+    AddParams params{};
+    params.activation = Activation::RELU;
+
+    Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorShape(output_tensor),
+                ::testing::ElementsAreArray(ref_output_shapes[i]))
+      << "With shape number " << i;
+    EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_outputs[i], tolerance))
+      << "With shape number " << i;
+  }
+}
+
+TEST_F(AddTest, Input_Output_Type_NEG)
+{
+  // Operands of different element types (FLOAT32 and S32): configure() is expected to throw.
+  AddParams add_params{};
+  add_params.activation = Activation::RELU;
+
+  Tensor lhs_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor rhs_tensor = makeInputTensor<DataType::S32>({1}, {2}, _memory_manager.get());
+  Tensor out_tensor = makeOutputTensor(DataType::FLOAT32);
+  Add kernel(&lhs_tensor, &rhs_tensor, &out_tensor, add_params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(AddTest, Invalid_Output_Type_NEG)
+{
+  // S64 operands paired with an S32 output tensor: configure() is expected to throw.
+  AddParams add_params{};
+  add_params.activation = Activation::RELU;
+
+  Tensor lhs_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
+  Tensor rhs_tensor = makeInputTensor<DataType::S64>({1}, {2}, _memory_manager.get());
+  Tensor out_tensor = makeOutputTensor(DataType::S32);
+  Add kernel(&lhs_tensor, &rhs_tensor, &out_tensor, add_params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(AddTest, Invalid_Input_Type_NEG)
+{
+  // U64 everywhere: the throw is expected from execute(), after configure() and allocation.
+  AddParams add_params{};
+  add_params.activation = Activation::RELU;
+
+  Tensor lhs_tensor = makeInputTensor<DataType::U64>({1}, {1}, _memory_manager.get());
+  Tensor rhs_tensor = makeInputTensor<DataType::U64>({1}, {2}, _memory_manager.get());
+  Tensor out_tensor = makeOutputTensor(DataType::U64);
+  Add kernel(&lhs_tensor, &rhs_tensor, &out_tensor, add_params);
+  kernel.configure();
+  _memory_manager->allocate_memory(out_tensor);
+  EXPECT_ANY_THROW(kernel.execute());
+}
+
+TEST_F(AddTest, Invalid_Quantization_NEG)
+{
+  // S16 tensors created without scale / zero point: configure() is expected to throw.
+  AddParams add_params{};
+  add_params.activation = Activation::NONE;
+
+  Tensor lhs_tensor = makeInputTensor<DataType::S16>({1}, {1}, _memory_manager.get());
+  Tensor rhs_tensor = makeInputTensor<DataType::S16>({1}, {2}, _memory_manager.get());
+  Tensor out_tensor = makeOutputTensor(DataType::S16);
+  Add kernel(&lhs_tensor, &rhs_tensor, &out_tensor, add_params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ArgMax.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/ArgMax.cpp
new file mode 100644
index 000000000..6561a1783
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ArgMax.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ArgMax.h"
+#include "kernels/Utils.h"
+#include "PALArgMax.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+ArgMax::ArgMax(const Tensor *input, const Tensor *axis, Tensor *output, const ArgMaxParams &params)
+  : KernelWithParams<ArgMaxParams>({input, axis}, {output}, params)
+{ // nothing to do here: operands and params are handed to the KernelWithParams base
+}
+
+void ArgMax::configure()
+{
+  // Validates the operands and resizes the output to the input shape without the reduced axis.
+  assert(axis()->element_type() == DataType::S32 || axis()->element_type() == DataType::S64);
+  assert(input()->shape().num_dims() >= 1);
+  assert(axis()->shape().num_elements() == 1);
+  assert(output()->element_type() == _params.output_type);
+
+  const Shape &input_shape = input()->shape();
+  const int num_dims = input_shape.num_dims();
+  // Read the axis with its own element type; an S64 tensor must not be read through int32_t.
+  int64_t axis_value = (axis()->element_type() == DataType::S64)
+                         ? getTensorData<int64_t>(axis())[0]
+                         : static_cast<int64_t>(getTensorData<int32_t>(axis())[0]);
+  // A negative axis counts from the last dimension.
+  if (axis_value < 0)
+    axis_value += num_dims;
+  // Anything outside [0, num_dims) would make the loop below write past output_shape.
+  assert(axis_value >= 0 && axis_value < num_dims);
+
+  Shape output_shape(num_dims - 1);
+  for (int i = 0, j = 0; i < num_dims; ++i)
+  {
+    if (i != axis_value)
+      output_shape.dim(j++) = input_shape.dim(i);
+  }
+  output()->resize(output_shape);
+}
+
+void ArgMax::execute() const
+{
+  // Helper macro: one ArgMinMax call for a concrete (data, axis, output) C++ type triple.
+#define TF_LITE_ARG_MAX(data_type, axis_type, output_type)                                    \
+  luci_interpreter_pal::ArgMinMax(getTensorShape(input()), getTensorData<data_type>(input()), \
+                                  getTensorData<axis_type>(axis()), getTensorShape(output()), \
+                                  getTensorData<output_type>(output()), std::greater<data_type>())
+  if (axis()->element_type() == DataType::S32) // axis values are read as int32_t
+  {
+    switch (_params.output_type) // selects the C++ type used for the output buffer
+    {
+      case DataType::S32:
+        switch (input()->element_type())
+        {
+          case DataType::FLOAT32:
+            TF_LITE_ARG_MAX(float, int32_t, int32_t);
+            break;
+          case DataType::U8:
+            TF_LITE_ARG_MAX(uint8_t, int32_t, int32_t);
+            break;
+          default:
+            throw std::runtime_error("Unsupported input type.");
+        }
+        break;
+      case DataType::S64:
+        switch (input()->element_type())
+        {
+          case DataType::FLOAT32:
+            TF_LITE_ARG_MAX(float, int32_t, int64_t);
+            break;
+          case DataType::U8:
+            TF_LITE_ARG_MAX(uint8_t, int32_t, int64_t);
+            break;
+          default:
+            throw std::runtime_error("Unsupported input type.");
+        }
+        break;
+      default:
+        throw std::runtime_error("Unsupported output type.");
+    }
+  }
+  else // every other axis element type is read as int64_t
+  {
+    switch (_params.output_type) // same dispatch as above, with an int64_t axis
+    {
+      case DataType::S32:
+        switch (input()->element_type())
+        {
+          case DataType::FLOAT32:
+            TF_LITE_ARG_MAX(float, int64_t, int32_t);
+            break;
+          case DataType::U8:
+            TF_LITE_ARG_MAX(uint8_t, int64_t, int32_t);
+            break;
+          default:
+            throw std::runtime_error("Unsupported input type.");
+        }
+        break;
+      case DataType::S64:
+        switch (input()->element_type())
+        {
+          case DataType::FLOAT32:
+            TF_LITE_ARG_MAX(float, int64_t, int64_t);
+            break;
+          case DataType::U8:
+            TF_LITE_ARG_MAX(uint8_t, int64_t, int64_t);
+            break;
+          default:
+            throw std::runtime_error("Unsupported input type.");
+        }
+        break;
+      default:
+        throw std::runtime_error("Unsupported output type.");
+    }
+  }
+#undef TF_LITE_ARG_MAX
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ArgMax.h b/compiler/luci-micro/luci-interpreter/src/kernels/ArgMax.h
new file mode 100644
index 000000000..c851b5891
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ArgMax.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_ARGMAX_H
+#define LUCI_INTERPRETER_KERNELS_ARGMAX_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class ArgMax : public KernelWithParams<ArgMaxParams>
+{
+public:
+  ArgMax(const Tensor *input, const Tensor *axis, Tensor *output, const ArgMaxParams &params);
+  // Accessors for the tensors stored by the base class (two inputs, one output).
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *axis() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() resizes the output tensor; execute() dispatches on the tensor element types.
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_ARGMAX_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ArgMax.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/ArgMax.test.cpp
new file mode 100644
index 000000000..474f4b321
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ArgMax.test.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ArgMax.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T1, typename T2>
+void Check(std::initializer_list<int32_t> input_shape,
+           std::initializer_list<int32_t> dimension_shape,
+           std::initializer_list<int32_t> output_shape, std::initializer_list<T1> input_data,
+           std::initializer_list<int32_t> dimension_data, std::initializer_list<T2> output_data)
+{
+  // Runs ArgMax on the given input/axis pair and checks the output values and shape.
+  constexpr DataType input_type = getElementType<T1>();
+  const DataType result_type = getElementType<T2>();
+  std::unique_ptr<IMemoryManager> manager = std::make_unique<TestMemoryManager>();
+
+  Tensor data = makeInputTensor<input_type>(input_shape, input_data, manager.get());
+  Tensor axis = makeInputTensor<DataType::S32>(dimension_shape, dimension_data, manager.get());
+  Tensor output_tensor = makeOutputTensor(result_type);
+
+  ArgMaxParams argmax_params{};
+  argmax_params.output_type = result_type;
+  ArgMax kernel(&data, &axis, &output_tensor, argmax_params);
+  kernel.configure();
+  manager->allocate_memory(output_tensor);
+  kernel.execute();
+  EXPECT_THAT(extractTensorData<T2>(output_tensor), ::testing::ElementsAreArray(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), output_shape);
+}
+
+template <typename T> class ArgMaxTest : public ::testing::Test
+{
+};
+// Input element types for which the typed tests of this suite are instantiated.
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_SUITE(ArgMaxTest, DataTypes);
+
+TYPED_TEST(ArgMaxTest, Simple)
+{
+  Check<TypeParam, int32_t>(/*input_shape=*/{1, 1, 1, 4}, /*dimension_shape=*/{},
+                            /*output_shape=*/{1, 1, 1},
+                            /*input_data=*/
+                            {
+                              1, 9, 7, 3, // largest value 9 is at index 1
+                            },
+                            /*dimension_data=*/{3}, /*output_data=*/{1});
+  Check<TypeParam, int64_t>(/*input_shape=*/{1, 1, 1, 4}, /*dimension_shape=*/{},
+                            /*output_shape=*/{1, 1, 1},
+                            /*input_data=*/
+                            {
+                              1, 9, 7, 3, // same data, checked with an int64_t output
+                            },
+                            /*dimension_data=*/{3}, /*output_data=*/{1});
+}
+
+TYPED_TEST(ArgMaxTest, MultiDimensions)
+{
+  Check<TypeParam, int32_t>(/*input_shape=*/{1, 1, 2, 4}, /*dimension_shape=*/{},
+                            /*output_shape=*/{1, 1, 2},
+                            /*input_data=*/
+                            {
+                              1, 2, 7, 8, // row 0: largest value 8 is at index 3
+                              1, 9, 7, 3, // row 1: largest value 9 is at index 1
+                            },
+                            /*dimension_data=*/{3}, /*output_data=*/{3, 1});
+  Check<TypeParam, int64_t>(/*input_shape=*/{1, 1, 2, 4}, /*dimension_shape=*/{},
+                            /*output_shape=*/{1, 1, 2},
+                            /*input_data=*/
+                            {
+                              1, 2, 7, 8, // same rows, checked with an int64_t output
+                              1, 9, 7, 3, //
+                            },
+                            /*dimension_data=*/{3}, /*output_data=*/{3, 1});
+}
+
+TEST(ArgMaxTest, UnsupportedType_NEG)
+{
+  // U8 is requested as output type; the test expects execute() to throw for it.
+  const DataType unsupported_type = DataType::U8;
+  std::unique_ptr<IMemoryManager> manager = std::make_unique<TestMemoryManager>();
+  const std::vector<float> values{
+    1, 2, 7, 8, //
+    1, 9, 7, 3, //
+  };
+  Tensor data = makeInputTensor<DataType::FLOAT32>({1, 1, 2, 4}, values, manager.get());
+  Tensor axis = makeInputTensor<DataType::S32>({}, {3}, manager.get());
+  Tensor output_tensor = makeOutputTensor(unsupported_type);
+  ArgMaxParams argmax_params{};
+  argmax_params.output_type = unsupported_type;
+  ArgMax kernel(&data, &axis, &output_tensor, argmax_params);
+  kernel.configure();
+  manager->allocate_memory(output_tensor);
+  EXPECT_ANY_THROW(kernel.execute());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/AveragePool2D.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/AveragePool2D.cpp
new file mode 100644
index 000000000..d3bade9e4
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/AveragePool2D.cpp
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/AveragePool2D.h"
+
+#include "kernels/Utils.h"
+
+#include "PALAveragePool2d.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+AveragePool2D::AveragePool2D(const Tensor *input, Tensor *output, Tensor *scratchpad,
+                             const Pool2DParams &params)
+  : KernelWithParams<Pool2DParams>({input}, {output, scratchpad}, params)
+{ // scratchpad is registered as a second output, after the real output tensor
+}
+
+void AveragePool2D::configure()
+{
+  if (input()->element_type() != output()->element_type())
+  {
+    throw std::runtime_error("Input Tensor and Output Tensor Type must be same");
+  }
+  if (input()->shape().num_dims() != 4)
+  {
+    throw std::runtime_error("Input Tensor Shape must be 4-D");
+  }
+  const Shape &input_shape = input()->shape();
+  // The four input dimensions are read as batches, height, width and depth, in that order.
+  const int32_t batches = input_shape.dim(0);
+  const int32_t input_height = input_shape.dim(1);
+  const int32_t input_width = input_shape.dim(2);
+  const int32_t depth = input_shape.dim(3);
+  // Output extent per spatial axis, computed from padding mode, input size, filter and stride.
+  const int32_t output_height =
+    computeOutputSize(_params.padding, input_height, _params.filter_height, _params.stride_height);
+  const int32_t output_width =
+    computeOutputSize(_params.padding, input_width, _params.filter_width, _params.stride_width);
+  // The padding values are kept in members so that the eval*() routines can use them later.
+  _padding_height =
+    computePadding(_params.stride_height, 1, input_height, _params.filter_height, output_height);
+  _padding_width =
+    computePadding(_params.stride_width, 1, input_width, _params.filter_width, output_width);
+  if (input()->element_type() == DataType::U8) // scale and zero point must match the output
+  {
+    LUCI_INTERPRETER_CHECK(std::abs(output()->scale() - input()->scale()) <= 1.0e-6);
+    LUCI_INTERPRETER_CHECK(output()->zero_point() == input()->zero_point());
+  }
+  else if (input()->element_type() == DataType::S16) // scales match, both zero points are 0
+  {
+    LUCI_INTERPRETER_CHECK(std::abs(output()->scale() - input()->scale()) <= 1.0e-6);
+    LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && output()->zero_point() == 0);
+  }
+  else if (input()->element_type() == DataType::S8) // same two checks as for U8
+  {
+    LUCI_INTERPRETER_CHECK(std::abs(output()->scale() - input()->scale()) <= 1.0e-6);
+    LUCI_INTERPRETER_CHECK(output()->zero_point() == input()->zero_point());
+  }
+  output()->resize({batches, output_height, output_width, depth});
+  // The PAL sets up the scratchpad tensor (second output) from the element type and shapes.
+  auto scratchpad = getOutputTensors()[1];
+  luci_interpreter_pal::SetupScratchpadTensor(scratchpad, input()->element_type(),
+                                              getTensorShape(input()), getTensorShape(output()));
+}
+
+void AveragePool2D::execute() const
+{
+  // Chooses the evaluation routine from the element type of the input tensor:
+  //   FLOAT32 -> evalFloat, U8 -> evalQuantized, S16 -> evalSInt16, S8 -> evalSInt8.
+  // Any other element type is reported with std::runtime_error.
+  const auto input_type = input()->element_type();
+
+  if (input_type == DataType::FLOAT32)
+    evalFloat();
+  else if (input_type == DataType::U8)
+    evalQuantized();
+  else if (input_type == DataType::S16)
+    evalSInt16();
+  else if (input_type == DataType::S8)
+    evalSInt8();
+  else
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void AveragePool2D::evalFloat() const
+{
+  // FLOAT32 path: the range computed from _params.activation travels inside PoolParams.
+  float range_min{};
+  float range_max{};
+  calculateActivationRange(_params.activation, &range_min, &range_max);
+  tflite::PoolParams pool_params{};
+  pool_params.float_activation_min = range_min;
+  pool_params.float_activation_max = range_max;
+  pool_params.filter_height = _params.filter_height;
+  pool_params.filter_width = _params.filter_width;
+  pool_params.stride_height = _params.stride_height;
+  pool_params.stride_width = _params.stride_width;
+  pool_params.padding_values.height = _padding_height;
+  pool_params.padding_values.width = _padding_width;
+  tflite::reference_ops::AveragePool(pool_params, getTensorShape(input()),
+                                     getTensorData<float>(input()), getTensorShape(output()),
+                                     getTensorData<float>(output()));
+}
+
+void AveragePool2D::evalQuantized() const
+{
+  // U8 path: the range is computed with the output tensor and passed on via PoolParams.
+  int32_t range_min{};
+  int32_t range_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &range_min, &range_max);
+
+  tflite::PoolParams pool_params{};
+  pool_params.quantized_activation_min = range_min;
+  pool_params.quantized_activation_max = range_max;
+  pool_params.filter_height = _params.filter_height;
+  pool_params.filter_width = _params.filter_width;
+  pool_params.stride_height = _params.stride_height;
+  pool_params.stride_width = _params.stride_width;
+  pool_params.padding_values.height = _padding_height;
+  pool_params.padding_values.width = _padding_width;
+  tflite::reference_ops::AveragePool(pool_params, getTensorShape(input()),
+                                     getTensorData<uint8_t>(input()), getTensorShape(output()),
+                                     getTensorData<uint8_t>(output()));
+}
+
+void AveragePool2D::evalSInt8() const
+{
+  // S8 path: evaluated by the PAL AveragePool, which also receives the scratchpad tensor.
+  int32_t range_min{};
+  int32_t range_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &range_min, &range_max);
+
+  tflite::PoolParams pool_params{};
+  pool_params.quantized_activation_min = range_min;
+  pool_params.quantized_activation_max = range_max;
+  pool_params.filter_height = _params.filter_height;
+  pool_params.filter_width = _params.filter_width;
+  pool_params.stride_height = _params.stride_height;
+  pool_params.stride_width = _params.stride_width;
+  pool_params.padding_values.height = _padding_height;
+  pool_params.padding_values.width = _padding_width;
+
+  // The scratchpad data pointer stays null unless the tensor reports itself allocatable.
+  const auto scratch = getOutputTensors()[1];
+  int8_t *const scratch_data = scratch->is_allocatable() ? scratch->data<int8_t>() : nullptr;
+  luci_interpreter_pal::AveragePool<int8_t>(
+    pool_params, getTensorShape(input()), getTensorData<int8_t>(input()), getTensorShape(output()),
+    getTensorData<int8_t>(output()), getTensorShape(scratch), scratch_data);
+}
+
+void AveragePool2D::evalSInt16() const
+{
+  // S16 path: handled by the reference_integer_ops variant of AveragePool.
+  int32_t range_min{};
+  int32_t range_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &range_min, &range_max);
+
+  tflite::PoolParams pool_params{};
+  pool_params.quantized_activation_min = range_min;
+  pool_params.quantized_activation_max = range_max;
+  pool_params.filter_height = _params.filter_height;
+  pool_params.filter_width = _params.filter_width;
+  pool_params.stride_height = _params.stride_height;
+  pool_params.stride_width = _params.stride_width;
+  pool_params.padding_values.height = _padding_height;
+  pool_params.padding_values.width = _padding_width;
+  tflite::reference_integer_ops::AveragePool(
+    pool_params, getTensorShape(input()), getTensorData<int16_t>(input()), //
+    getTensorShape(output()), getTensorData<int16_t>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/AveragePool2D.h b/compiler/luci-micro/luci-interpreter/src/kernels/AveragePool2D.h
new file mode 100644
index 000000000..2c8fe16e7
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/AveragePool2D.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_AVERAGEPOOL2D_H
+#define LUCI_INTERPRETER_KERNELS_AVERAGEPOOL2D_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class AveragePool2D : public KernelWithParams<Pool2DParams>
+{
+public:
+  AveragePool2D(const Tensor *input, Tensor *output, Tensor *scratchpad,
+                const Pool2DParams &params);
+  // The scratchpad has no accessor of its own here.
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;     // used by execute() for FLOAT32 input
+  void evalQuantized() const; // used by execute() for U8 input
+  void evalSInt16() const;    // used by execute() for S16 input
+  void evalSInt8() const;     // used by execute() for S8 input
+
+private:
+  int32_t _padding_height{}; // written by configure(), read by the eval*() routines
+  int32_t _padding_width{};  // written by configure(), read by the eval*() routines
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_AVERAGEPOOL2D_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/AveragePool2D.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/AveragePool2D.test.cpp
new file mode 100644
index 000000000..478bfa68e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/AveragePool2D.test.cpp
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/AveragePool2D.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class AveragePool2DTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Created in SetUp(); the tests use it to build input tensors and allocate outputs.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(AveragePool2DTest, Float)
+{
+  Shape input_shape{1, 3, 5, 1};
+  std::vector<float> input_data{
+    -4, -3, -2, -1, 0,  //
+    1,  2,  3,  4,  5,  //
+    6,  7,  8,  9,  10, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+  // 2x3 averaging window, stride 1 (height) x 2 (width), VALID padding, RELU6 on the result.
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.filter_height = 2;
+  params.filter_width = 3;
+  params.stride_height = 1;
+  params.stride_width = 2;
+  params.activation = Activation::RELU6;
+
+  AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(scratchpad);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Raw window means are -0.5, 1.5, 4.5 and 6.5; RELU6 clamps the first to 0 and the last to 6.
+  std::vector<float> ref_output_data{
+    0, 1.5, //
+    4.5, 6, //
+  };
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2, 1}));
+}
+
+TEST_F(AveragePool2DTest, Uint8_0)
+{
+  std::vector<float> input_data{
+    0,  -6, 12, 4, //
+    -3, -2, 10, 7, //
+  };
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-15.9375f, 15.9375f);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+  Tensor scratchpad(DataType::U8, Shape({}), {}, "");
+  // Non-overlapping 2x2 windows (stride 2), VALID padding, RELU6 on each mean.
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.filter_height = 2;
+  params.filter_width = 2;
+  params.stride_height = 2;
+  params.stride_width = 2;
+  params.activation = Activation::RELU6;
+
+  AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(scratchpad);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Real-valued window means are -2.75 and 8.25, which RELU6 clamps to 0 and 6.
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear({0.0, 6.0}));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 1}));
+}
+
+TEST_F(AveragePool2DTest, Uint8_1)
+{
+  std::vector<float> input_data{
+    0, 6, 12, 4, //
+    3, 2, 10, 7, //
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-15.9375f, 15.9375f);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+  Tensor scratchpad(DataType::U8, Shape({}), {}, "");
+  // Non-overlapping 2x2 windows (stride 2), VALID padding, RELU6 on each mean.
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.filter_height = 2;
+  params.filter_width = 2;
+  params.stride_height = 2;
+  params.stride_width = 2;
+  params.activation = Activation::RELU6;
+
+  AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  _memory_manager->allocate_memory(scratchpad);
+  kernel.execute();
+  // Real-valued window means are 2.75 and 8.25; only the second is clamped (to 6) by RELU6.
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear({2.75, 6.0}));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 1}));
+}
+
+TEST_F(AveragePool2DTest, SInt16)
+{
+  Shape input_shape{1, 3, 5, 1};
+  std::vector<int32_t> ref_output_shape{1, 2, 2, 1};
+  std::vector<float> input_data{
+    -4, -3, -2, -1, 0,  //
+    1,  2,  3,  4,  5,  //
+    6,  7,  8,  9,  10, //
+  };
+  std::vector<float> ref_output_data{
+    0, 1.5, //
+    4.5, 6, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>(input_shape, 0.5, 0, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.5, 0);
+  Tensor scratchpad(DataType::S16, Shape({}), {}, "");
+  // 2x3 window, stride 1 x 2, VALID padding, RELU6: raw means -0.5, 1.5, 4.5, 6.5 before clamping.
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.filter_height = 2;
+  params.filter_width = 3;
+  params.stride_height = 1;
+  params.stride_width = 2;
+  params.activation = Activation::RELU6;
+
+  AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(scratchpad);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST_F(AveragePool2DTest, SInt8)
+{
+  Shape input_shape{1, 4, 5, 1};
+  std::vector<int32_t> ref_output_shape{1, 2, 2, 1};
+  std::vector<float> input_data{-7, -3, 0,  2, -5, 12, -15, 3,  10, 5,
+                                7,  -6, -1, 9, -2, 0,  -5,  11, -1, -7};
+  std::vector<float> ref_output_data{
+    0, 2.5, //
+    1, 1.5, //
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<int8_t>(-15.9375f, 15.9375f);
+  Tensor input_tensor = makeInputTensor<DataType::S8>(
+    input_shape, quant_param.first, quant_param.second, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S8, quant_param.first, quant_param.second);
+  Tensor scratchpad(DataType::S8, Shape({}), {}, "");
+  // 2x3 windows, stride 2 in both directions, VALID padding, RELU6 on each mean.
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.filter_height = 2;
+  params.filter_width = 3;
+  params.stride_height = 2;
+  params.stride_width = 2;
+  params.activation = Activation::RELU6;
+
+  AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(scratchpad);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Real-valued window means: about -1.67 (clamped to 0 by RELU6), 2.5, 1 and 1.5.
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST_F(AveragePool2DTest, Invalid_Input_Shape_NEG)
+{
+  Shape input_shape{1, 3, 5};
+  std::vector<float> input_data{
+    -4, -3, -2, -1, 0,  //
+    1,  2,  3,  4,  5,  //
+    6,  7,  8,  9,  10, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.filter_height = 2;
+  params.filter_width = 3;
+  params.stride_height = 1;
+  params.stride_width = 2;
+  params.activation = Activation::RELU6;
+  // The input is only rank 3 ({1, 3, 5}); configure() is expected to throw.
+  AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(AveragePool2DTest, In_Out_Type_NEG)
+{
+  Shape input_shape{1, 3, 5, 1};
+  std::vector<float> input_data{
+    -4, -3, -2, -1, 0,  //
+    1,  2,  3,  4,  5,  //
+    6,  7,  8,  9,  10, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.filter_height = 2;
+  params.filter_width = 3;
+  params.stride_height = 1;
+  params.stride_width = 2;
+  params.activation = Activation::RELU6;
+  // A FLOAT32 input is paired with a U8 output; configure() is expected to throw.
+  AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(AveragePool2DTest, Quant_Param_NEG)
+{
+  std::vector<float> input_data{
+    0,  -6, 12, 4, //
+    -3, -2, 10, 7, //
+  };
+
+  std::pair<float, int32_t> quant_param1 = quantizationParams<uint8_t>(-15.9375f, 15.9375f);
+  std::pair<float, int32_t> quant_param2 = quantizationParams<uint8_t>(-7.875f, 7.875f);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param1.first, quant_param1.second, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param2.first, quant_param2.second);
+  Tensor scratchpad(DataType::U8, Shape({}), {}, "");
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.filter_height = 2;
+  params.filter_width = 2;
+  params.stride_height = 2;
+  params.stride_width = 2;
+  params.activation = Activation::RELU6;
+  // Input and output were built from different quantization ranges; configure() should throw.
+  AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/BatchMatMul.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/BatchMatMul.cpp
new file mode 100644
index 000000000..24ca22996
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/BatchMatMul.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/BatchMatMul.h"
+#include "kernels/Utils.h"
+
+#include "PALBatchMatMul.h"
+
+#include <tensorflow/lite/kernels/internal/reference/transpose.h>
+
+#include <stdexcept>
+
+namespace
+{
+
+tflite::RuntimeShape SwapRowColumnDims(const tflite::RuntimeShape &shape)
+{
+  const int32_t rank = shape.DimensionsCount();
+  tflite::RuntimeShape result(shape); // copy first, then exchange the two innermost extents
+  result.SetDim(rank - 1, shape.Dims(rank - 2));
+  result.SetDim(rank - 2, shape.Dims(rank - 1));
+  return result;
+}
+
+} // namespace
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+BatchMatMul::BatchMatMul(const Tensor *x, const Tensor *y, Tensor *output, Tensor *x_tmp,
+                         Tensor *y_tmp, const BatchMatMulParams &params)
+  : KernelWithParams({x, y}, {output, x_tmp, y_tmp}, params)
+{ // Inputs are {x, y}; the scratch tensors x_tmp / y_tmp are registered after `output`.
+}
+
+void BatchMatMul::configure()
+{
+  // Validates operand types, ranks and shapes, prepares the scratch tensors, sizes the output.
+  auto lhs = x();
+  auto rhs = y();
+  auto adj_x = params().adj_x;
+  auto adj_y = params().adj_y;
+
+  // TODO Support non-float types
+  if (lhs->element_type() != DataType::FLOAT32 || rhs->element_type() != DataType::FLOAT32)
+    throw std::runtime_error("Unsupported type.");
+  // Always true after the FLOAT32 test above; only matters once other types are let through.
+  LUCI_INTERPRETER_CHECK(lhs->element_type() == rhs->element_type());
+
+  auto lhs_rank = lhs->shape().num_dims();
+  auto rhs_rank = rhs->shape().num_dims();
+  LUCI_INTERPRETER_CHECK(lhs_rank >= 2 && lhs_rank <= 4);
+  LUCI_INTERPRETER_CHECK(rhs_rank >= 2 && rhs_rank <= 4);
+  // The PAL sets up both scratch tensors from the two operand shapes.
+  auto lhs_scratchpad = temp_lhs();
+  auto rhs_scratchpad = temp_rhs();
+  luci_interpreter_pal::SetupScratchpadTensor(lhs_scratchpad, rhs_scratchpad, getTensorShape(lhs),
+                                              getTensorShape(rhs));
+
+  auto output_rank = std::max(lhs_rank, rhs_rank);
+  auto extended_lhs_shape = tflite::RuntimeShape::ExtendedShape(output_rank, getTensorShape(lhs));
+  auto extended_rhs_shape = tflite::RuntimeShape::ExtendedShape(output_rank, getTensorShape(rhs));
+
+  // Ensure any batch dimensions obey broadcasting rules: where they differ, one side must be 1.
+  for (int i = 0; i < output_rank - 2; ++i)
+  {
+    const int lhs_dim = extended_lhs_shape.Dims(i);
+    const int rhs_dim = extended_rhs_shape.Dims(i);
+    if (lhs_dim != rhs_dim)
+    {
+      if (lhs_dim != 1)
+      {
+        LUCI_INTERPRETER_CHECK(rhs_dim == 1);
+      }
+    }
+  }
+
+  // Ensure other dimensions work for matrix multiplication.
+  int accum_dim_lhs =
+    adj_x ? extended_lhs_shape.Dims(output_rank - 2) : extended_lhs_shape.Dims(output_rank - 1);
+  int accum_dim_rhs =
+    adj_y ? extended_rhs_shape.Dims(output_rank - 1) : extended_rhs_shape.Dims(output_rank - 2);
+  LUCI_INTERPRETER_CHECK(accum_dim_lhs == accum_dim_rhs);
+
+  Shape output_shape(output_rank);
+  // Fill in any broadcast dimensions.
+  for (int i = 0; i < output_rank - 2; ++i)
+  {
+    const int lhs_dim = extended_lhs_shape.Dims(i);
+    const int rhs_dim = extended_rhs_shape.Dims(i);
+    int broadcast_dim = lhs_dim;
+    if ((lhs_dim != rhs_dim) && (lhs_dim == 1))
+    {
+      broadcast_dim = rhs_dim;
+    }
+    output_shape.dim(i) = broadcast_dim;
+  }
+  // Fill in the matmul dimensions.
+  int lhs_rows_index = adj_x ? output_rank - 1 : output_rank - 2;
+  int rhs_cols_index = adj_y ? output_rank - 2 : output_rank - 1;
+
+  output_shape.dim(output_rank - 2) = extended_lhs_shape.Dims(lhs_rows_index);
+  output_shape.dim(output_rank - 1) = extended_rhs_shape.Dims(rhs_cols_index);
+
+  output()->resize(output_shape);
+}
+
+void TransposeRowsColumns(const Tensor *tensor_in, Tensor *tensor_out)
+{
+  // Copies tensor_in into tensor_out with the last two axes exchanged (FLOAT32 only).
+  const tflite::RuntimeShape shape(getTensorShape(tensor_in));
+  // The output extents come from the file-local helper instead of a second hand-written swap.
+  const tflite::RuntimeShape transposed_shape = SwapRowColumnDims(shape);
+  const int rank = shape.DimensionsCount();
+  // Identity permutation on the leading axes; only the last two trade places.
+  tflite::TransposeParams params;
+  params.perm_count = rank;
+  for (int i = 0; i < rank - 2; ++i)
+  {
+    params.perm[i] = i;
+  }
+  params.perm[rank - 2] = rank - 1;
+  params.perm[rank - 1] = rank - 2;
+  switch (tensor_in->element_type())
+  {
+    case DataType::FLOAT32:
+      tflite::reference_ops::Transpose(params, shape, getTensorData<float>(tensor_in),
+                                       transposed_shape, getTensorData<float>(tensor_out));
+      break;
+    default:
+      throw std::runtime_error("Only support fp32 BatchMatMul for now.");
+  }
+}
+
+void BatchMatMul::execute() const
+{
+  // Brings both operands into the layout handed to the PAL, then runs the float matmul.
+  const Tensor *lhs = x();
+  const Tensor *rhs = y();
+  const bool adj_x = params().adj_x;
+  const bool adj_y = params().adj_y;
+  // RHS: taken as stored when adj_y is set; otherwise transposed into its scratch tensor.
+  const Tensor *rhs_tensor = rhs;
+  tflite::RuntimeShape rhs_shape = getTensorShape(rhs);
+  if (!adj_y)
+  {
+    TransposeRowsColumns(rhs, temp_rhs());
+    rhs_tensor = temp_rhs();
+    rhs_shape = SwapRowColumnDims(rhs_shape);
+  }
+  // LHS: transposed into scratch only when adj_x is set; otherwise just its shape is swapped.
+  const Tensor *lhs_tensor = lhs;
+  tflite::RuntimeShape lhs_shape = getTensorShape(lhs);
+  if (adj_x)
+  {
+    TransposeRowsColumns(lhs, temp_lhs());
+    lhs_tensor = temp_lhs();
+  }
+  else
+  {
+    lhs_shape = SwapRowColumnDims(lhs_shape);
+  }
+  // Only FLOAT32 is handled; note that the PAL call receives the RHS operand first.
+  if (lhs->element_type() != DataType::FLOAT32)
+    throw std::runtime_error("Unsupported type.");
+  luci_interpreter_pal::BatchMatMul(rhs_shape, getTensorData<float>(rhs_tensor), lhs_shape,
+                                    getTensorData<float>(lhs_tensor), getTensorShape(output()),
+                                    getTensorData<float>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/BatchMatMul.h b/compiler/luci-micro/luci-interpreter/src/kernels/BatchMatMul.h
new file mode 100644
index 000000000..744f49795
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/BatchMatMul.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_BATCHMATMUL_H
+#define LUCI_INTERPRETER_KERNELS_BATCHMATMUL_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class BatchMatMul : public KernelWithParams<BatchMatMulParams>
+{
+  // Batched matrix-multiplication kernel: two inputs, one result and two scratch tensors.
+public:
+  BatchMatMul(const Tensor *x, const Tensor *y, Tensor *output, Tensor *x_tmp, Tensor *y_tmp,
+              const BatchMatMulParams &params);
+  const Tensor *x() const { return _inputs[0]; }
+  const Tensor *y() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+private:
+  // The scratch tensors are read back from output slots 1 and 2.
+  Tensor *temp_lhs() const { return _outputs[1]; }
+  Tensor *temp_rhs() const { return _outputs[2]; }
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_BATCHMATMUL_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/BatchMatMul.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/BatchMatMul.test.cpp
new file mode 100644
index 000000000..edfa3a685
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/BatchMatMul.test.cpp
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/BatchMatMul.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class BatchMatMulTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Created in SetUp(); the tests use it to build input tensors and allocate outputs.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(BatchMatMulTest, Float)
+{
+  std::vector<float> lhs_data = {1, 2, 3, 4, 5, 6};
+  std::vector<float> rhs_data = {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
+  Tensor lhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 3}, lhs_data, _memory_manager.get());
+  Tensor rhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 3, 4}, rhs_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+  // Plain product of a {1, 2, 3} lhs and a {1, 3, 4} rhs; neither adjoint flag is set.
+  BatchMatMulParams params;
+  params.adj_x = false;
+  params.adj_y = false;
+
+  BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(lhs_scratch);
+  _memory_manager->allocate_memory(rhs_scratch);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor),
+              FloatArrayNear({74., 80., 86., 92., 173., 188., 203., 218.}));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4}));
+}
+
+TEST_F(BatchMatMulTest, Float_SimpleRHSAdjoint)
+{
+  std::vector<float> lhs_data = {1, 2, 3, 4, 5, 6};
+  std::vector<float> rhs_data = {7, 11, 15, 8, 12, 16, 9, 13, 17, 10, 14, 18};
+  Tensor lhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 3}, lhs_data, _memory_manager.get());
+  Tensor rhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 4, 3}, rhs_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+  // adj_y is set: rhs is supplied as {1, 4, 3}, the transpose of the 3x4 matrix 7..18.
+  BatchMatMulParams params;
+  params.adj_x = false;
+  params.adj_y = true;
+
+  BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(lhs_scratch);
+  _memory_manager->allocate_memory(rhs_scratch);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor),
+              FloatArrayNear({74., 80., 86., 92., 173., 188., 203., 218.}));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4}));
+}
+
+TEST_F(BatchMatMulTest, Float_SimpleLHSAdjoint)
+{
+  std::vector<float> lhs_data = {1, 4, 2, 5, 3, 6};
+  std::vector<float> rhs_data = {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
+  Tensor lhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 3, 2}, lhs_data, _memory_manager.get());
+  Tensor rhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 3, 4}, rhs_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+  // adj_x is set: lhs is supplied as {1, 3, 2}, the transpose of the 2x3 matrix 1..6.
+  BatchMatMulParams params;
+  params.adj_x = true;
+  params.adj_y = false;
+
+  BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(lhs_scratch);
+  _memory_manager->allocate_memory(rhs_scratch);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor),
+              FloatArrayNear({74., 80., 86., 92., 173., 188., 203., 218.}));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4}));
+}
+
+TEST_F(BatchMatMulTest, Float_BatchSizeTwo)
+{
+  std::vector<float> lhs_data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  std::vector<float> rhs_data = {7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18,
+                                 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30};
+  Tensor lhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 2, 3}, lhs_data, _memory_manager.get());
+  Tensor rhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3, 4}, rhs_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+  // Both operands have batch size 2: one 2x3 by 3x4 product per batch entry.
+  BatchMatMulParams params;
+  params.adj_x = false;
+  params.adj_y = false;
+
+  BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(lhs_scratch);
+  _memory_manager->allocate_memory(rhs_scratch);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor),
+              FloatArrayNear({74., 80., 86., 92., 173., 188., 203., 218., 560., 584., 608., 632.,
+                              767., 800., 833., 866.}));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 2, 4}));
+}
+
+TEST_F(BatchMatMulTest, Float_DiffBatch)
+{
+  std::vector<float> lhs_data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  std::vector<float> rhs_data = {7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18,
+                                 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30};
+  Tensor lhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 1, 6}, lhs_data, _memory_manager.get());
+  Tensor rhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 6, 4}, rhs_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+  // lhs has batch 2 while rhs has batch 1: the single 6x4 rhs matrix serves both lhs rows.
+  BatchMatMulParams params;
+  params.adj_x = false;
+  params.adj_y = false;
+
+  BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(lhs_scratch);
+  _memory_manager->allocate_memory(rhs_scratch);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor),
+              FloatArrayNear({427., 448., 469., 490., 1039., 1096., 1153., 1210.}));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 1, 4}));
+}
+
+TEST_F(BatchMatMulTest, Invalid_Shape_NEG)
+{
+  Tensor lhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 2}, {1, 2, 3, 4}, _memory_manager.get());
+  Tensor rhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 3, 2}, {5, 6, 7, 8, 9, 10}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+
+  BatchMatMulParams params;
+  params.adj_x = false;
+  params.adj_y = false;
+  // lhs has 2 columns but rhs has 3 rows; configure() is expected to throw.
+  BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(BatchMatMulTest, Invalid_Batch_NEG)
+{
+  Tensor lhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 1, 3}, {1, 2, 3, 4, 5, 6}, _memory_manager.get());
+  Tensor rhs_tensor = makeInputTensor<DataType::FLOAT32>({3, 3, 1}, {5, 6, 7, 8, 9, 10, 11, 12, 13},
+                                                         _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+
+  BatchMatMulParams params;
+  params.adj_x = false;
+  params.adj_y = false;
+  // Batch sizes 2 and 3 differ and neither is 1; configure() is expected to throw.
+  BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(BatchMatMulTest, Invalid_Rank_NEG)
+{
+  Tensor lhs_tensor = makeInputTensor<DataType::FLOAT32>({4}, {1, 2, 3, 4}, _memory_manager.get());
+  Tensor rhs_tensor = makeInputTensor<DataType::FLOAT32>({1, 4, 2}, {5, 6, 7, 8, 9, 10, 11, 12},
+                                                         _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+
+  BatchMatMulParams params;
+  params.adj_x = false;
+  params.adj_y = false;
+  // lhs is only rank 1; configure() is expected to throw.
+  BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(BatchMatMulTest, Invalid_Rank2_NEG)
+{
+  Tensor lhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 1, 1, 1, 4}, {1, 2, 3, 4}, _memory_manager.get());
+  Tensor rhs_tensor = makeInputTensor<DataType::FLOAT32>({1, 4, 2}, {5, 6, 7, 8, 9, 10, 11, 12},
+                                                         _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+
+  BatchMatMulParams params;
+  params.adj_x = false;
+  params.adj_y = false;
+  // lhs is rank 5; configure() is expected to throw.
+  BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(BatchMatMulTest, TypeMisMatch_NEG)
+{
+  Tensor lhs_tensor =
+    makeInputTensor<DataType::U8>({1, 2, 3}, {1, 2, 3, 4, 5, 6}, _memory_manager.get());
+  Tensor rhs_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 3, 2}, {5, 6, 7, 8, 9, 10}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Tensor lhs_scratch(DataType::U8, Shape({}), {}, "");
+  Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, "");
+
+  BatchMatMulParams params;
+  params.adj_x = false;
+  params.adj_y = false;
+  // lhs is U8 while rhs is FLOAT32; configure() is expected to throw.
+  BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/BatchToSpaceND.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/BatchToSpaceND.cpp
new file mode 100644
index 000000000..bd315ff7b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/BatchToSpaceND.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/BatchToSpaceND.h"
+#include "kernels/Utils.h"
+
+#include "PALBatchToSpaceND.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+namespace
+{
+const int kInputMinDimensionNum = 3; // lowest input rank that configure() accepts
+const int kInputMaxDimensionNum = 4; // highest input rank that configure() accepts
+} // namespace
+
+BatchToSpaceND::BatchToSpaceND(const Tensor *input, const Tensor *block_shape, const Tensor *crops,
+                               Tensor *output)
+  : Kernel({input, block_shape, crops}, {output})
+{ // Inputs are registered as {input, block_shape, crops}; `output` is the only output.
+}
+
+void BatchToSpaceND::configure()
+{
+  // Validates ranks, types, block sizes and crops, then resizes the output tensor to match.
+  const auto *block_shape_data = block_shape()->data<int32_t>();
+  const auto *crops_data = crops()->data<int32_t>();
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() >= kInputMinDimensionNum);
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() <= kInputMaxDimensionNum);
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+
+  int spatial_dims_num = input()->shape().num_dims() - 2;
+  LUCI_INTERPRETER_CHECK(block_shape()->shape().num_dims() == 1);
+  LUCI_INTERPRETER_CHECK(block_shape()->shape().dim(0) == spatial_dims_num);
+
+  LUCI_INTERPRETER_CHECK(crops()->shape().num_dims() == 2);
+  LUCI_INTERPRETER_CHECK(crops()->shape().dim(0) == spatial_dims_num);
+  LUCI_INTERPRETER_CHECK(crops()->shape().dim(1) == 2);
+  for (int i = 0; i < spatial_dims_num * 2; ++i)
+  {
+    LUCI_INTERPRETER_CHECK(crops_data[i] >= 0);
+  }
+
+  Shape output_shape = Shape(input()->shape().num_dims());
+  int output_batch_size = input()->shape().dim(0);
+  for (int i = 0; i < spatial_dims_num; ++i)
+  {
+    LUCI_INTERPRETER_CHECK(block_shape_data[i] > 0); // guards the modulo and division below
+    LUCI_INTERPRETER_CHECK(output_batch_size % block_shape_data[i] == 0);
+    output_batch_size = output_batch_size / block_shape_data[i];
+    output_shape.dim(i + 1) =
+      input()->shape().dim(i + 1) * block_shape_data[i] - crops_data[i * 2] - crops_data[i * 2 + 1];
+  }
+
+  output_shape.dim(0) = output_batch_size;
+  output_shape.dim(input()->shape().num_dims() - 1) =
+    input()->shape().dim(input()->shape().num_dims() - 1);
+  output()->resize(output_shape);
+}
+
+void BatchToSpaceND::execute() const
+{ // Dispatches on the input element type; only FLOAT32 and U8 are handled.
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      luci_interpreter_pal::BatchToSpaceND(
+        getTensorShape(input()), getTensorData<float>(input()), getTensorShape(block_shape()),
+        getTensorData<int32_t>(block_shape()), getTensorShape(crops()),
+        getTensorData<int32_t>(crops()), getTensorShape(output()), getTensorData<float>(output()));
+      break;
+    case DataType::U8:
+      luci_interpreter_pal::BatchToSpaceND(
+        getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(block_shape()),
+        getTensorData<int32_t>(block_shape()), getTensorShape(crops()),
+        getTensorData<int32_t>(crops()), getTensorShape(output()),
+        getTensorData<uint8_t>(output()));
+      break;
+    default: // every other element type is rejected
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/BatchToSpaceND.h b/compiler/luci-micro/luci-interpreter/src/kernels/BatchToSpaceND.h
new file mode 100644
index 000000000..57703ea5d
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/BatchToSpaceND.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_BATCHTOSPACEND_H
+#define LUCI_INTERPRETER_KERNELS_BATCHTOSPACEND_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class BatchToSpaceND : public Kernel // Kernel taking input, block_shape and crops tensors.
+{
+public:
+  BatchToSpaceND(const Tensor *input, const Tensor *block_shape, const Tensor *crops,
+                 Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; } // first kernel input
+  const Tensor *block_shape() const { return _inputs[1]; } // second kernel input
+  const Tensor *crops() const { return _inputs[2]; } // third kernel input
+  Tensor *output() const { return _outputs[0]; } // the only output
+
+  void configure() override; // defined in BatchToSpaceND.cpp; validates and resizes output
+  void execute() const override; // defined in BatchToSpaceND.cpp; handles FLOAT32 and U8
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_BATCHTOSPACEND_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp
new file mode 100644
index 000000000..52647a763
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/BatchToSpaceND.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T> // T: element type of the input and output tensors
+void Check(std::initializer_list<int32_t> input_shape, // Runs the kernel once, checks the result.
+           std::initializer_list<int32_t> block_shape_shape,
+           std::initializer_list<int32_t> crops_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<T> input_data, std::initializer_list<int32_t> block_shape_data,
+           std::initializer_list<int32_t> crops_data, std::initializer_list<T> output_data)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  constexpr DataType element_type = getElementType<T>();
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
+  Tensor block_shape_tensor = // block_shape is always S32
+    makeInputTensor<DataType::S32>(block_shape_shape, block_shape_data, memory_manager.get());
+  Tensor crops_tensor = // crops is always S32
+    makeInputTensor<DataType::S32>(crops_shape, crops_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(element_type);
+
+  BatchToSpaceND kernel(&input_tensor, &block_shape_tensor, &crops_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor); // allocate after configure(), before execute()
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), output_shape);
+}
+
+template <typename T> class BatchToSpaceNDTest : public ::testing::Test // empty typed fixture
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>; // element types the typed tests run with
+TYPED_TEST_SUITE(BatchToSpaceNDTest, DataTypes);
+
+TYPED_TEST(BatchToSpaceNDTest, Simple) // {4, 2, 2, 1} with block {2, 2}, zero crops -> {1, 4, 4, 1}
+{
+  Check<TypeParam>(/*input_shape=*/{4, 2, 2, 1}, /*block_shape_shape=*/{2}, /*crops_shape=*/{2, 2},
+                   /*output_shape=*/{1, 4, 4, 1},
+                   /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                   /*block_shape_data=*/{2, 2}, /*crops_data=*/{0, 0, 0, 0},
+                   /*output_data=*/{1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16});
+}
+
+TEST(BatchToSpaceNDTest, Invalid_Shape_NEG) // batch size 3 with block {2, 2}: configure() must throw
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(
+    {3, 2, 2, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, memory_manager.get());
+  Tensor block_shape_tensor = makeInputTensor<DataType::S32>({2}, {2, 2}, memory_manager.get());
+  Tensor crops_tensor = makeInputTensor<DataType::S32>({2, 2}, {0, 0, 0, 0}, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  BatchToSpaceND kernel(&input_tensor, &block_shape_tensor, &crops_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure()); // presumably the batch % block check in configure() fires
+}
+
+TEST(BatchToSpaceNDTest, Invalid_Crops_NEG) // a negative crop value: configure() must throw
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(
+    {4, 2, 2, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, memory_manager.get());
+  Tensor block_shape_tensor = makeInputTensor<DataType::S32>({2}, {2, 2}, memory_manager.get());
+  Tensor crops_tensor = makeInputTensor<DataType::S32>({2, 2}, {0, 0, -1, 0}, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  BatchToSpaceND kernel(&input_tensor, &block_shape_tensor, &crops_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure()); // presumably the crops >= 0 check in configure() fires
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/BinaryOpCommon.h b/compiler/luci-micro/luci-interpreter/src/kernels/BinaryOpCommon.h
new file mode 100644
index 000000000..2d2842a9e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/BinaryOpCommon.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_BINARYOPUTILS_H
+#define LUCI_INTERPRETER_KERNELS_BINARYOPUTILS_H
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Derived from tensorflow/lite/kernels/internal/reference/maximum_minimum.h (v2.3.0).
+template <typename T, typename Op, int N = 5> // N: highest rank accepted by the asserts below
+void BinaryOpBroadcastSlow(const tflite::RuntimeShape &unextended_input1_shape,
+                           const T *input1_data,
+                           const tflite::RuntimeShape &unextended_input2_shape,
+                           const T *input2_data,
+                           const tflite::RuntimeShape &unextended_output_shape, T *output_data,
+                           Op op) // op: binary functor, called as op(lhs, rhs) per element
+{
+  if (unextended_input1_shape == unextended_input2_shape) // equal shapes: flat element-wise loop
+  {
+    const int flat_size = tflite::MatchingElementsSize(
+      unextended_input1_shape, unextended_input2_shape, unextended_output_shape);
+    for (int i = 0; i < flat_size; ++i)
+    {
+      output_data[i] = op(input1_data[i], input2_data[i]);
+    }
+  }
+  else // shapes differ: index each input through its own broadcast descriptor
+  {
+    assert(unextended_input1_shape.DimensionsCount() <= N);
+    assert(unextended_input2_shape.DimensionsCount() <= N);
+    assert(unextended_output_shape.DimensionsCount() <= N);
+
+    tflite::NdArrayDesc<N> desc1{};
+    tflite::NdArrayDesc<N> desc2{};
+    tflite::NdArrayDesc<N> output_desc{};
+    tflite::NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape,
+                                                &desc1, &desc2);
+    tflite::CopyDimsToDesc(tflite::RuntimeShape::ExtendedShape(N, unextended_output_shape),
+                           &output_desc);
+
+    auto fn = [&](int indexes[N]) { // called by NDOpsHelper with an N-element index array
+      output_data[SubscriptToIndex(output_desc, indexes)] =
+        op(input1_data[SubscriptToIndex(desc1, indexes)],
+           input2_data[SubscriptToIndex(desc2, indexes)]);
+    };
+    tflite::NDOpsHelper<N>(output_desc, fn);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_BINARYOPUTILS_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/CMakeLists.txt b/compiler/luci-micro/luci-interpreter/src/kernels/CMakeLists.txt
new file mode 100644
index 000000000..9f4ba0e0b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/CMakeLists.txt
@@ -0,0 +1,43 @@
+set(SOURCES # sources that are always part of the kernels library
+        BinaryOpCommon.h
+        Utils.h
+        Utils.cpp
+        "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/TestMemoryManager.h"
+        ${LUCI_INTERPRETER_SOURCE_DIR}/TestMemoryManager.cpp
+        "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/SimpleMemoryManager.h"
+        ${LUCI_INTERPRETER_SOURCE_DIR}/SimpleMemoryManager.cpp)
+
+macro(REGISTER_KERNEL NODE) # first definition: append <NODE>.h and <NODE>.cpp to SOURCES
+  list(APPEND SOURCES "${NODE}.h")
+  list(APPEND SOURCES "${NODE}.cpp")
+endmacro(REGISTER_KERNEL)
+
+include(${KERNEL_REGISTER_FILE}) # presumably invokes REGISTER_KERNEL once per kernel - confirm
+
+add_library(${LUCI_INTERPRETER_KERNELS} STATIC ${SOURCES})
+if (NOT NNCC_LIBRARY_NO_PIC)
+  set_target_properties(${LUCI_INTERPRETER_KERNELS} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif(NOT NNCC_LIBRARY_NO_PIC)
+target_include_directories(${LUCI_INTERPRETER_KERNELS} PUBLIC ${LUCI_INTERPRETER_SOURCE_DIR})
+
+target_link_libraries(${LUCI_INTERPRETER_KERNELS} PUBLIC ${LUCI_INTERPRETER_CORE})
+target_link_libraries(${LUCI_INTERPRETER_KERNELS} PRIVATE nncc_common)
+
+add_pal_to_target(${LUCI_INTERPRETER_KERNELS}) # helper defined outside this file
+
+if(NOT ENABLE_TEST) # everything below only sets up the test target
+  return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
+macro(REGISTER_KERNEL NODE) # redefinition: now append <NODE>.test.cpp to TEST_SOURCES
+  list(APPEND TEST_SOURCES "${NODE}.test.cpp")
+endmacro(REGISTER_KERNEL)
+
+include(${KERNEL_REGISTER_FILE}) # included again so the redefined macro is the one that runs
+
+list(APPEND TEST_SOURCES TestUtils.h TestUtils.cpp)
+
+GTest_AddTest(${LUCI_INTERPRETER_KERNELS}_test ${TEST_SOURCES})
+target_link_libraries(${LUCI_INTERPRETER_KERNELS}_test ${LUCI_INTERPRETER_KERNELS})
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Cast.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Cast.cpp
new file mode 100644
index 000000000..39ee725dc
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Cast.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Cast.h"
+#include "kernels/Utils.h"
+
+namespace
+{
+
+using namespace luci_interpreter;
+using namespace luci_interpreter::kernels;
+
+template <typename InT, typename OutT> // Converts elements_count values via static_cast<OutT>.
+void cast_data(const InT *in_data, OutT *out_data, uint32_t elements_count)
+{
+  std::transform(in_data, in_data + elements_count, out_data, // reads in_data, writes out_data
+                 [](InT a) { return static_cast<OutT>(a); });
+}
+
+template <typename InT> void cast_from_pointer_to_tensor(const InT *in_data, Tensor *out_tensor)
+{ // Casts in_data into out_tensor; the destination type follows out_tensor's element type.
+  auto const out_type = out_tensor->element_type();
+  auto const elements_count = out_tensor->shape().num_elements(); // count taken from the output
+
+  switch (out_type)
+  {
+    case loco::DataType::U8:
+      cast_data(in_data, getTensorData<uint8_t>(out_tensor), elements_count);
+      break;
+    case loco::DataType::U16:
+      cast_data(in_data, getTensorData<uint16_t>(out_tensor), elements_count);
+      break;
+    case loco::DataType::U32:
+      cast_data(in_data, getTensorData<uint32_t>(out_tensor), elements_count);
+      break;
+    case loco::DataType::U64:
+      cast_data(in_data, getTensorData<uint64_t>(out_tensor), elements_count);
+      break;
+    case loco::DataType::S8:
+      cast_data(in_data, getTensorData<int8_t>(out_tensor), elements_count);
+      break;
+    case loco::DataType::S16:
+      cast_data(in_data, getTensorData<int16_t>(out_tensor), elements_count);
+      break;
+    case loco::DataType::S32:
+      cast_data(in_data, getTensorData<int32_t>(out_tensor), elements_count);
+      break;
+    case loco::DataType::S64:
+      cast_data(in_data, getTensorData<int64_t>(out_tensor), elements_count);
+      break;
+    case loco::DataType::FLOAT32:
+      cast_data(in_data, getTensorData<float>(out_tensor), elements_count);
+      break;
+    case loco::DataType::BOOL:
+      cast_data(in_data, getTensorData<bool>(out_tensor), elements_count);
+      break;
+    default: // any output type not listed above is rejected
+      throw std::runtime_error("Unsupported output type.");
+  }
+}
+
+void cast_from_tensor_to_tensor(const Tensor *in_tensor, Tensor *out_tensor)
+{ // Selects the source type from in_tensor; the callee then selects the destination type.
+  auto in_type = in_tensor->element_type();
+
+  switch (in_type)
+  {
+    case loco::DataType::U8:
+      cast_from_pointer_to_tensor(getTensorData<uint8_t>(in_tensor), out_tensor);
+      break;
+    case loco::DataType::U16:
+      cast_from_pointer_to_tensor(getTensorData<uint16_t>(in_tensor), out_tensor);
+      break;
+    case loco::DataType::U32:
+      cast_from_pointer_to_tensor(getTensorData<uint32_t>(in_tensor), out_tensor);
+      break;
+    case loco::DataType::U64:
+      cast_from_pointer_to_tensor(getTensorData<uint64_t>(in_tensor), out_tensor);
+      break;
+    case loco::DataType::S8:
+      cast_from_pointer_to_tensor(getTensorData<int8_t>(in_tensor), out_tensor);
+      break;
+    case loco::DataType::S16:
+      cast_from_pointer_to_tensor(getTensorData<int16_t>(in_tensor), out_tensor);
+      break;
+    case loco::DataType::S32:
+      cast_from_pointer_to_tensor(getTensorData<int32_t>(in_tensor), out_tensor);
+      break;
+    case loco::DataType::S64:
+      cast_from_pointer_to_tensor(getTensorData<int64_t>(in_tensor), out_tensor);
+      break;
+    case loco::DataType::FLOAT32:
+      cast_from_pointer_to_tensor(getTensorData<float>(in_tensor), out_tensor);
+      break;
+    case loco::DataType::BOOL:
+      cast_from_pointer_to_tensor(getTensorData<bool>(in_tensor), out_tensor);
+      break;
+    default: // any input type not listed above is rejected
+      throw std::runtime_error("Unsupported input type.");
+  }
+}
+
+} // namespace
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Cast::Cast(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {} // 1 input, 1 output
+
+void Cast::configure() // Rejects Unknown element types and gives output the input's shape.
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() != loco::DataType::Unknown);
+  LUCI_INTERPRETER_CHECK(output()->element_type() != loco::DataType::Unknown);
+
+  const Shape &shape = input()->shape();
+  output()->resize(shape); // a cast changes the element type only, never the shape
+}
+
+void Cast::execute() const // Converts every input element to the output's element type.
+{
+  assert(input()->shape().num_elements() == output()->shape().num_elements()); // NDEBUG disables
+
+  cast_from_tensor_to_tensor(input(), output()); // dispatches on both element types
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Cast.h b/compiler/luci-micro/luci-interpreter/src/kernels/Cast.h
new file mode 100644
index 000000000..f0bd02037
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Cast.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_CAST_H
+#define LUCI_INTERPRETER_KERNELS_CAST_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Cast : public Kernel // Kernel converting one tensor to the output's element type.
+{
+public:
+  Cast(const Tensor *input, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; } // the only input
+  Tensor *output() const { return _outputs[0]; } // the only output
+
+  void configure() override; // defined in Cast.cpp; type checks and output resize
+  void execute() const override; // defined in Cast.cpp; performs the conversion
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_CAST_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Cast.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Cast.test.cpp
new file mode 100644
index 000000000..4713ad34c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Cast.test.cpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Cast.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T1, typename T2> // T1: input element type, T2: output element type
+void Check(std::initializer_list<int32_t> shape, std::initializer_list<T1> input_data,
+           std::initializer_list<T2> output_data) // Runs Cast once and checks data and shape.
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  constexpr DataType input_type = getElementType<T1>();
+  constexpr DataType output_type = getElementType<T2>();
+
+  Tensor input_tensor = makeInputTensor<input_type>(shape, input_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(output_type);
+
+  Cast kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor); // allocate after configure(), before execute()
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<T2>(output_tensor), ::testing::ElementsAreArray(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), shape); // shape must be unchanged by the cast
+}
+
+template <typename T> // T: output element type; the input is always BOOL
+void CheckBoolTo(std::initializer_list<int32_t> shape, std::initializer_list<bool> input_data,
+                 std::initializer_list<T> output_data) // Like Check, for BOOL input tensors.
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  constexpr DataType input_type = loco::DataType::BOOL;
+  constexpr DataType output_type = getElementType<T>();
+  std::vector<typename DataTypeImpl<input_type>::Type> input_data_converted;
+  for (auto elem : input_data) // copy the bools into the storage type mapped to BOOL
+  {
+    input_data_converted.push_back(elem);
+  }
+
+  Tensor input_tensor =
+    makeInputTensor<input_type>(shape, input_data_converted, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(output_type);
+
+  Cast kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor); // allocate after configure(), before execute()
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), shape);
+}
+
+template <typename T> class CastTest : public ::testing::Test // empty typed fixture
+{
+};
+
+using IntDataTypes = // the eight integer types the typed tests run with
+  ::testing::Types<uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t>;
+TYPED_TEST_SUITE(CastTest, IntDataTypes);
+
+TYPED_TEST(CastTest, FloatToInt) // float -> each integer type, using whole-number values
+{
+  Check<float, TypeParam>(/*shape=*/{1, 1, 1, 4},
+                          /*input_data=*/
+                          {
+                            1.0f, 9.0f, 7.0f, 3.0f, //
+                          },
+                          /*output_data=*/
+                          {
+                            1, 9, 7, 3, //
+                          });
+  SUCCEED();
+}
+
+TYPED_TEST(CastTest, IntToFloat) // each integer type -> float
+{
+  Check<TypeParam, float>(/*shape=*/{1, 1, 1, 4},
+                          /*input_data=*/
+                          {
+                            1, 9, 7, 3, //
+                          },
+                          /*output_data=*/
+                          {
+                            1.0f, 9.0f, 7.0f, 3.0f, //
+                          });
+  SUCCEED();
+}
+
+template <typename T1, typename T2> void check_int() // Casts {1, 9, 7, 3} from T1 to T2.
+{
+  Check<T1, T2>(/*shape=*/{1, 1, 1, 4},
+                /*input_data=*/
+                {
+                  1, 9, 7, 3, //
+                },
+                /*output_data=*/
+                {
+                  1, 9, 7, 3, //
+                });
+  SUCCEED();
+}
+
+TYPED_TEST(CastTest, IntToInt) // TypeParam -> every integer type, so all 8x8 pairs get covered
+{
+  check_int<TypeParam, uint8_t>();
+  check_int<TypeParam, uint16_t>();
+  check_int<TypeParam, uint32_t>();
+  check_int<TypeParam, uint64_t>();
+  check_int<TypeParam, int8_t>();
+  check_int<TypeParam, int16_t>();
+  check_int<TypeParam, int32_t>();
+  check_int<TypeParam, int64_t>();
+  SUCCEED();
+}
+
+TYPED_TEST(CastTest, IntToBool) // non-zero integers become true, zero becomes false
+{
+  Check<TypeParam, bool>(/*shape=*/{1, 1, 1, 4},
+                         /*input_data=*/
+                         {
+                           1, 0, 7, 0, //
+                         },
+                         /*output_data=*/
+                         {
+                           true, false, true, false, //
+                         });
+  SUCCEED();
+}
+
+TYPED_TEST(CastTest, BoolToInt) // true -> 1 and false -> 0 for each integer type
+{
+  CheckBoolTo<TypeParam>(/*shape=*/{1, 1, 1, 4},
+                         /*input_data=*/
+                         {
+                           true, false, false, true, //
+                         },
+                         /*output_data=*/
+                         {
+                           1, 0, 0, 1, //
+                         });
+  SUCCEED();
+}
+
+TEST(CastTest, FloatToBool) // non-zero floats become true, 0.0f becomes false
+{
+  Check<float, bool>(/*shape=*/{1, 1, 1, 4},
+                     /*input_data=*/
+                     {
+                       1.0f, 0.0f, 7.0f, 0.0f, //
+                     },
+                     /*output_data=*/
+                     {
+                       true, false, true, false, //
+                     });
+  SUCCEED();
+}
+
+TEST(CastTest, BoolToFloat) // true -> 1.0f and false -> 0.0f
+{
+  CheckBoolTo<float>(/*shape=*/{1, 1, 1, 4},
+                     /*input_data=*/
+                     {
+                       true, false, false, true, //
+                     },
+                     /*output_data=*/
+                     {
+                       1.0f, 0.0f, 0.0f, 1.0f, //
+                     });
+  SUCCEED();
+}
+
+TEST(CastTest, FloatToFloat) // same-type cast: output must equal the input
+{
+  Check<float, float>(/*shape=*/{1, 1, 1, 4},
+                      /*input_data=*/
+                      {
+                        1.0f, 0.0f, 7.0f, 0.0f, //
+                      },
+                      /*output_data=*/
+                      {
+                        1.0f, 0.0f, 7.0f, 0.0f, //
+                      });
+  SUCCEED();
+}
+
+TEST(CastTest, BoolToBool) // same-type cast: output must equal the input
+{
+  CheckBoolTo<bool>(/*shape=*/{1, 1, 1, 4},
+                    /*input_data=*/
+                    {
+                      true, true, false, false, //
+                    },
+                    /*output_data=*/
+                    {
+                      true, true, false, false, //
+                    });
+  SUCCEED();
+}
+
+TEST(CastTest, UnsupportedType_NEG) // an output of type Unknown: configure() must throw
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 1, 2, 4},
+                                                           {
+                                                             1, 2, 7, 8, //
+                                                             1, 9, 7, 3, //
+                                                           },
+                                                           memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::Unknown); // the invalid part of the setup
+
+  Cast kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+  SUCCEED();
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Concatenation.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Concatenation.cpp
new file mode 100644
index 000000000..46ee5941e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Concatenation.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Concatenation.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/concatenation.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Concatenation::Concatenation(std::vector<const Tensor *> inputs, Tensor *output,
+                             const ConcatenationParams &params) // any number of inputs, 1 output
+  : KernelWithParams<ConcatenationParams>(std::move(inputs), {output}, params)
+{
+}
+
+void Concatenation::configure() // Validates the inputs and sizes the output along the axis.
+{
+  const int num_inputs = _inputs.size();
+  LUCI_INTERPRETER_CHECK(num_inputs > 0);
+  const Tensor *t0 = _inputs[0]; // first input is the reference for type, rank and shape
+
+  // TODO: Support concat with fused activation function
+  LUCI_INTERPRETER_CHECK(params().activation == luci::FusedActFunc::NONE);
+
+  int axis = _params.axis;
+  if (axis < 0) // a negative axis is offset by the rank
+    axis += t0->shape().num_dims();
+  LUCI_INTERPRETER_CHECK(axis >= 0 && axis < t0->shape().num_dims());
+
+  int32_t sum_axis = t0->shape().dim(axis); // running output size along the axis
+  for (int i = 1; i < num_inputs; ++i)
+  {
+    const Tensor *tensor = _inputs[i];
+    LUCI_INTERPRETER_CHECK(tensor->element_type() == t0->element_type());
+    LUCI_INTERPRETER_CHECK(tensor->shape().num_dims() == t0->shape().num_dims());
+    for (int d = 0; d < t0->shape().num_dims(); ++d)
+    {
+      if (d == axis) // sizes along the axis add up
+      {
+        sum_axis += tensor->shape().dim(axis);
+      }
+      else // every other dimension must match the first input
+      {
+        LUCI_INTERPRETER_CHECK(tensor->shape().dim(d) == t0->shape().dim(d));
+      }
+    }
+  }
+
+  Shape output_shape = t0->shape(); // copy of the first input's shape with the axis replaced
+  output_shape.dim(axis) = sum_axis;
+
+  // If input tensors are INT8 type then quantization parameters of all input tensors and the output
+  // should be the same
+  for (auto current_tensor : _inputs)
+  {
+    if (current_tensor->element_type() == DataType::S8)
+    {
+      LUCI_INTERPRETER_CHECK(current_tensor->quantized_dimension() ==
+                             output()->quantized_dimension());
+
+      LUCI_INTERPRETER_CHECK(current_tensor->zero_points().size() ==
+                             current_tensor->scales().size());
+      LUCI_INTERPRETER_CHECK(current_tensor->zero_points() == output()->zero_points());
+      LUCI_INTERPRETER_CHECK(current_tensor->scales() == output()->scales());
+    }
+  }
+  output()->resize(output_shape);
+}
+
+void Concatenation::execute() const // Dispatches on the first input's element type.
+{
+  switch (_inputs[0]->element_type())
+  {
+    case DataType::FLOAT32:
+      evalGeneric<float>();
+      break;
+    case DataType::U8: // U8 takes the ConcatenationWithScaling path
+      evalQuantized();
+      break;
+    case DataType::S8: // generic path; configure() requires S8 inputs to match output quantization
+      evalGeneric<int8_t>();
+      break;
+    case DataType::S32:
+      evalGeneric<int32_t>();
+      break;
+    case DataType::S64:
+      evalGeneric<int64_t>();
+      break;
+    default: // every other element type is rejected
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+template <typename T> void Concatenation::evalGeneric() const // T: element type of all tensors
+{
+  int axis = _params.axis;
+  if (axis < 0) // a negative axis is offset by the output rank
+    axis += output()->shape().num_dims();
+
+  VectorOfTensors<T, true> inputs(_inputs); // supplies the shapes()/data() arrays used below
+  tflite::ConcatenationParams params{};
+  params.axis = axis;
+  params.inputs_count = _inputs.size();
+  tflite::reference_ops::Concatenation(params, inputs.shapes(), inputs.data(),
+                                       getTensorShape(output()), getTensorData<T>(output()));
+}
+
+void Concatenation::evalQuantized() const // U8 path: passes input and output scale/zero-point.
+{
+  int axis = _params.axis;
+  if (axis < 0) // a negative axis is offset by the output rank
+    axis += output()->shape().num_dims();
+
+  VectorOfQuantizedTensors<true> inputs(_inputs); // also supplies zero_point() and scale()
+  tflite::ConcatenationParams params{};
+  params.axis = axis;
+  params.input_zeropoint = inputs.zero_point();
+  params.input_scale = inputs.scale();
+  params.inputs_count = _inputs.size();
+  params.output_zeropoint = output()->zero_point();
+  params.output_scale = output()->scale();
+
+  tflite::reference_ops::ConcatenationWithScaling(params, inputs.shapes(), inputs.data(),
+                                                  getTensorShape(output()),
+                                                  getTensorData<uint8_t>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Concatenation.h b/compiler/luci-micro/luci-interpreter/src/kernels/Concatenation.h
new file mode 100644
index 000000000..b48c8ed1e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Concatenation.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_CONCATENATION_H
+#define LUCI_INTERPRETER_KERNELS_CONCATENATION_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Concatenation : public KernelWithParams<ConcatenationParams> // Joins inputs along an axis.
+{
+public:
+  Concatenation(std::vector<const Tensor *> inputs, Tensor *output,
+                const ConcatenationParams &params);
+
+  const Tensor *input(int index) const { return _inputs[index]; } // index is not range-checked
+  Tensor *output() const { return _outputs[0]; } // the only output
+
+  void configure() override; // defined in Concatenation.cpp; validation and output resize
+  void execute() const override; // defined in Concatenation.cpp; dispatch on element type
+
+private:
+  template <typename T> void evalGeneric() const; // used for FLOAT32, S8, S32 and S64
+  void evalQuantized() const; // used for U8
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_CONCATENATION_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Concatenation.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Concatenation.test.cpp
new file mode 100644
index 000000000..f893b38fd
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Concatenation.test.cpp
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Concatenation.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class ConcatenationTest : public ::testing::Test // Fixture for the Concatenation kernel tests.
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Assigned in SetUp(); the tests hand it to the tensor factory helpers and use it to allocate.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(ConcatenationTest, Float) // Two 2x3 float inputs, concatenated along axes 0, -2, 1, -1.
+{
+  std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
+  std::vector<float> input2_data{7, 8, 9, 10, 11, 12};
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input2_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  ConcatenationParams params{};
+  // The tensors and the params object are shared by the four scoped cases below.
+  // Try different 'axis' and expect different results.
+  {
+    params.axis = 0;
+    params.activation = luci::FusedActFunc::NONE;
+    // Output memory is allocated after configure(), here via the kernel's output tensor list.
+    Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+    kernel.configure();
+    for (auto t : kernel.getOutputTensors())
+    {
+      _memory_manager->allocate_memory(*t);
+    }
+    kernel.execute();
+    // Expected: the elements of input1 followed by the elements of input2.
+    EXPECT_THAT(extractTensorData<float>(output_tensor),
+                FloatArrayNear({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
+  }
+  {
+    params.axis = -2; // Same as '0'.
+    params.activation = luci::FusedActFunc::NONE;
+    // Same flow as above, except that the output tensor is allocated directly.
+    Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorData<float>(output_tensor),
+                FloatArrayNear({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
+  }
+  {
+    params.axis = 1;
+    params.activation = luci::FusedActFunc::NONE;
+
+    Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
+    kernel.execute();
+    // Expected: each row of input1 followed by the matching row of input2.
+    EXPECT_THAT(extractTensorData<float>(output_tensor),
+                FloatArrayNear({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+  }
+  {
+    params.axis = -1; // Same as '1'.
+    params.activation = luci::FusedActFunc::NONE;
+
+    Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorData<float>(output_tensor),
+                FloatArrayNear({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+  }
+}
+
+TEST_F(ConcatenationTest, Input_Number_Check_NEG)
+{
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  ConcatenationParams params{};
+
+  params.axis = -1;
+  params.activation = luci::FusedActFunc::NONE;
+  // Negative case: the kernel is built with an empty input list, so configure() must throw.
+  Concatenation kernel({}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(ConcatenationTest, Invalid_Axis_NEG)
+{
+  std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
+  std::vector<float> input2_data{7, 8, 9, 10, 11, 12};
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input2_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  ConcatenationParams params{};
+  // Negative case: the inputs have rank 2, so an axis of -3 is expected to be rejected.
+  params.axis = -3;
+  params.activation = luci::FusedActFunc::NONE;
+
+  Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(ConcatenationTest, Mismatching_Input_Type_NEG)
+{
+  std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
+  std::vector<uint8_t> input2_data{7, 8, 9, 10, 11, 12};
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::U8>({2, 3}, input2_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  ConcatenationParams params{};
+  // Negative case: a FLOAT32 input is mixed with a U8 input, so configure() must throw.
+  params.axis = -1;
+  params.activation = luci::FusedActFunc::NONE;
+
+  Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(ConcatenationTest, Mismatching_Input_Dimension_Num_NEG)
+{
+  std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
+  std::vector<float> input2_data{7, 8, 9, 10, 11, 12};
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 3}, input2_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  ConcatenationParams params{};
+  // Negative case: the inputs differ in rank (2 vs 3), so configure() must throw.
+  params.axis = -1;
+  params.activation = luci::FusedActFunc::NONE;
+
+  Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(ConcatenationTest, Mismatching_Input_Dimension_NEG)
+{
+  std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
+  std::vector<float> input2_data{7, 8, 9, 10, 11, 12, 13, 14, 15};
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>({3, 3}, input2_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  ConcatenationParams params{};
+  // Negative case: shapes {2, 3} and {3, 3} disagree in dim 0, which is not the axis (-1).
+  params.axis = -1;
+  params.activation = luci::FusedActFunc::NONE;
+
+  Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(ConcatenationTest, Int8_Mismatching_Input_Type_NEG)
+{
+  std::vector<uint8_t> input1_data{1, 2, 3, 4};
+  std::vector<int8_t> input2_data{5, 6, 7, 8};
+  Tensor input1_tensor = makeInputTensor<DataType::U8>({2, 2}, input1_data, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S8>({2, 2}, input2_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S8);
+  ConcatenationParams params{};
+  // Negative case: a U8 input is mixed with an S8 input (S8 output), so configure() must throw.
+  params.axis = -1;
+  params.activation = luci::FusedActFunc::NONE;
+
+  Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(ConcatenationTest, Int8_Mismatching_Input_Output_Quant_Params_NEG)
+{
+  std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
+  std::vector<float> input2_data{7, 8, 9, 10, 11, 12};
+  int quantized_dimension = 3;
+  std::vector<float> scales{0.1, 0.2, 0.3};
+  std::vector<int32_t> zero_points{1, -1, 1};
+  // Both S8 inputs are created with the three scales/zero points above along dimension 3.
+  Tensor input1_tensor = makeInputTensor<DataType::S8>(
+    {1, 1, 2, 3}, scales, zero_points, quantized_dimension, input1_data, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S8>(
+    {1, 1, 2, 3}, scales, zero_points, quantized_dimension, input2_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S8, scales.at(0), zero_points.at(0));
+  ConcatenationParams params{};
+  // Negative case: the output only carries the first scale/zero point, unlike the inputs.
+  params.axis = -1;
+  params.activation = luci::FusedActFunc::NONE;
+
+  Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(ConcatenationTest, Int8_Mismatching_Zero_Point_NEG)
+{
+  std::vector<float> input1_data{1, 2, 3, 4};
+  std::vector<float> input2_data{5, 6, 7, 8};
+  float scale = 0.1;
+  int32_t zero_point_1 = 1;
+  int32_t zero_point_2 = -1;
+  // Negative case: both S8 inputs share a scale but differ in zero point (1 vs -1).
+  Tensor input1_tensor =
+    makeInputTensor<DataType::S8>({2, 2}, scale, zero_point_1, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::S8>({2, 2}, scale, zero_point_2, input2_data, _memory_manager.get());
+  // The output reuses the scale and the zero point of the first input.
+  Tensor output_tensor = makeOutputTensor(DataType::S8, scale, zero_point_1);
+  ConcatenationParams params{};
+
+  params.axis = -1;
+  params.activation = luci::FusedActFunc::NONE;
+
+  Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+// TODO: Remove this test when concat w/ fused_activation is supported
+TEST_F(ConcatenationTest, With_Fused_Activation_NEG)
+{
+  std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
+  std::vector<float> input2_data{7, 8, 9, 10, 11, 12};
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input2_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  ConcatenationParams params{};
+  // Negative case: otherwise valid inputs, but a fused RELU is requested; configure() must throw.
+  params.axis = 1;
+  params.activation = luci::FusedActFunc::RELU;
+
+  Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Conv2D.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Conv2D.cpp
new file mode 100644
index 000000000..234f95425
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Conv2D.cpp
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Conv2D.h"
+
+#include "kernels/Utils.h"
+
+#include "PALConv2d.h"
+
+#include <stdexcept>
+#include <thread>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Conv2D::Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output,
+               Tensor *scratchpad, const Conv2DParams &params)
+  : KernelWithParams<Conv2DParams>({input, filter, bias}, {output, scratchpad}, params)
+{ // Nothing else to do: inputs {input, filter, bias} and outputs {output, scratchpad} go to the base.
+}
+
+void Conv2D::configure() // Validates types and shapes, sizes the output and the scratchpad.
+{
+  // TensorFlow Lite (as of v2.2.0) supports the following combinations of types:
+  //     | input filter bias  output |
+  // ----+---------------------------+
+  // (1) | float float  float float  |
+  // (2) | float int8   float float  | hybrid
+  // (3) | uint8 uint8  int32 uint8  | quantized
+  // (4) | int8  int8   int32 int8   | quantized per channel
+  //
+  // We only support (1), (3) and (4) for now, and additionally the following:
+  //     | input filter bias  output |
+  // ----+---------------------------+
+  // (5) | int16 int16  int64 int16  |
+  //
+  if (input()->element_type() == DataType::FLOAT32 && filter()->element_type() == DataType::FLOAT32)
+  {
+    LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
+  }
+  else if (input()->element_type() == DataType::U8 && filter()->element_type() == DataType::U8)
+  {
+    LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
+  }
+  else if (input()->element_type() == DataType::S8 && filter()->element_type() == DataType::S8)
+  {
+    LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
+    LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
+    LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
+                           static_cast<size_t>(filter()->shape().dim(0)));
+    for (auto zerop : filter()->zero_points())
+    {
+      LUCI_INTERPRETER_CHECK(zerop == 0); // Every S8 filter zero point has to be 0.
+    }
+  }
+  else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
+  {
+    LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+  LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
+  // Geometry checks. Dims are read as input [N, H, W, C] and filter [out_C, H, W, in_C].
+  const Shape &input_shape = input()->shape();
+  const Shape &filter_shape = filter()->shape();
+  LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 4 && filter_shape.num_dims() == 4);
+
+  const int32_t batches = input_shape.dim(0);
+  const int32_t input_height = input_shape.dim(1);
+  const int32_t input_width = input_shape.dim(2);
+  const int32_t output_depth = filter_shape.dim(0);
+  const int32_t filter_height = filter_shape.dim(1);
+  const int32_t filter_width = filter_shape.dim(2);
+  LUCI_INTERPRETER_CHECK(filter_shape.dim(3) == input_shape.dim(3));
+  // A bias is optional; when present it must be 1-D with output_depth elements.
+  LUCI_INTERPRETER_CHECK(bias() == nullptr || (bias()->shape().num_dims() == 1 &&
+                                               bias()->shape().dim(0) == output_depth));
+
+  const int32_t output_height =
+    computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
+                      _params.dilation_height_factor);
+  const int32_t output_width =
+    computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
+                      _params.dilation_width_factor);
+  // Padding is stored in members because the eval routines read it at execute() time.
+  _padding_height = computePadding(_params.stride_height, _params.dilation_height_factor,
+                                   input_height, filter_height, output_height);
+  _padding_width = computePadding(_params.stride_width, _params.dilation_width_factor, input_width,
+                                  filter_width, output_width);
+
+  output()->resize({batches, output_height, output_width, output_depth});
+
+  // Allocate tensor for scratchpad, if needed.
+  tflite::ConvParams params{};
+  params.padding_values.height = _padding_height;
+  params.padding_values.width = _padding_width;
+  params.stride_height = _params.stride_height;
+  params.stride_width = _params.stride_width;
+  params.dilation_height_factor = _params.dilation_height_factor;
+  params.dilation_width_factor = _params.dilation_width_factor;
+  auto scratchpad = getOutputTensors()[1];
+  luci_interpreter_pal::SetupScratchpadTensor(scratchpad, input()->element_type(), params,
+                                              getTensorShape(input()), getTensorShape(filter()),
+                                              getTensorShape(output()));
+  // Only the fused activations listed below are accepted; anything else throws.
+  switch (_params.activation)
+  {
+    case Activation::NONE:
+    case Activation::RELU:
+    case Activation::RELU6:
+    case Activation::RELU_N1_TO_1:
+      break;
+    default:
+      throw std::runtime_error("Unsupported fused activation");
+  }
+}
+
+void Conv2D::execute() const // Runs the eval routine matching the input (and filter) type.
+{
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      if (filter()->element_type() != DataType::FLOAT32)
+        throw std::runtime_error("Unsupported type.");
+      evalFloat();
+      break;
+    case DataType::U8:
+      if (filter()->scales().size() == 0) // No scale: fail loudly rather than skip evaluation.
+        throw std::runtime_error("Filter has no quantization scale.");
+      if (filter()->scales().size() == 1)
+      {
+        evalQuantized(); // Single, per-tensor filter scale.
+      }
+      else
+      {
+        LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
+        LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
+                               static_cast<size_t>(filter()->shape().dim(0)));
+        evalQuantizedPerChannel(); // One filter scale per output channel (checked above).
+      }
+      break;
+    case DataType::S8:
+      evalQuantizedS8PerChannel();
+      break;
+    case DataType::S16:
+      evalQuantizedS16();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Conv2D::evalFloat() const // FLOAT32 path, delegated to luci_interpreter_pal::Conv.
+{
+  float activation_min{};
+  float activation_max{};
+  calculateActivationRange(_params.activation, &activation_min, &activation_max);
+  // ConvParams: padding cached by configure(), stride/dilation from _params, activation clamp.
+  tflite::ConvParams params{};
+  params.padding_values.height = _padding_height;
+  params.padding_values.width = _padding_width;
+  params.stride_height = _params.stride_height;
+  params.stride_width = _params.stride_width;
+  params.dilation_height_factor = _params.dilation_height_factor;
+  params.dilation_width_factor = _params.dilation_width_factor;
+  params.float_activation_min = activation_min;
+  params.float_activation_max = activation_max;
+  // Output tensor #1 is the scratchpad; its buffer is fetched only when it is allocatable.
+  auto scratchpad = getOutputTensors()[1];
+  float *scratchpad_data = nullptr;
+  if (scratchpad->is_allocatable())
+    scratchpad_data = scratchpad->data<float>();
+
+  luci_interpreter_pal::Conv(params, getTensorShape(input()), getTensorData<float>(input()),
+                             getTensorShape(filter()), getTensorData<float>(filter()),
+                             getTensorShape(bias()), getTensorData<float>(bias()),
+                             getTensorShape(output()), getTensorData<float>(output()),
+                             getTensorShape(scratchpad), scratchpad_data);
+}
+
+void Conv2D::evalQuantized() const // U8 path for a filter with a single scale.
+{
+  const auto input_scale = static_cast<double>(input()->scale());
+  const auto filter_scale = static_cast<double>(filter()->scale());
+  const auto output_scale = static_cast<double>(output()->scale());
+  // Rescale factor input_scale * filter_scale / output_scale, expressed as multiplier + shift.
+  const double real_multiplier = input_scale * filter_scale / output_scale;
+  int32_t output_multiplier{};
+  int output_shift{};
+  quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
+  // Integer clamp range for the fused activation, derived from the output tensor.
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+
+  tflite::ConvParams params{};
+  params.padding_values.height = _padding_height;
+  params.padding_values.width = _padding_width;
+  params.stride_height = _params.stride_height;
+  params.stride_width = _params.stride_width;
+  params.dilation_height_factor = _params.dilation_height_factor;
+  params.dilation_width_factor = _params.dilation_width_factor;
+  // The kernel expects input and filter zero points to be negated.
+  params.input_offset = -input()->zero_point();    // Note the '-'.
+  params.weights_offset = -filter()->zero_point(); // Note the '-'.
+  params.output_offset = output()->zero_point();
+  params.output_multiplier = output_multiplier;
+  params.output_shift = output_shift;
+  params.quantized_activation_min = activation_min;
+  params.quantized_activation_max = activation_max;
+  // NOTE(review): scratchpad data is fetched without the is_allocatable() guard evalFloat() has.
+  auto scratchpad = getOutputTensors()[1];
+  luci_interpreter_pal::Conv(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+                             getTensorShape(filter()), getTensorData<uint8_t>(filter()),
+                             getTensorShape(bias()), getTensorData<int32_t>(bias()),
+                             getTensorShape(output()), getTensorData<uint8_t>(output()),
+                             getTensorShape(scratchpad), getTensorData<uint8_t>(scratchpad));
+}
+
+void Conv2D::evalQuantizedPerChannel() const // U8 path with several filter scales (plain loops).
+{
+  const auto *input_data = getTensorData<uint8_t>(input());
+  const auto *filter_data = getTensorData<uint8_t>(filter());
+  const auto *bias_data = getTensorData<int32_t>(bias());
+  auto *output_data = getTensorData<uint8_t>(output());
+
+  const Shape &input_shape = input()->shape();
+  const Shape &filter_shape = filter()->shape();
+  const Shape &output_shape = output()->shape();
+  // Dims are read as input/output [N, H, W, C] and filter [out_C, H, W, in_C].
+  const int32_t batches = input_shape.dim(0);
+  const int32_t input_height = input_shape.dim(1);
+  const int32_t input_width = input_shape.dim(2);
+  const int32_t input_depth = input_shape.dim(3);
+  const int32_t output_depth = filter_shape.dim(0);
+  const int32_t filter_height = filter_shape.dim(1);
+  const int32_t filter_width = filter_shape.dim(2);
+  const int32_t output_height = output_shape.dim(1);
+  const int32_t output_width = output_shape.dim(2);
+
+  const int32_t stride_height = _params.stride_height;
+  const int32_t stride_width = _params.stride_width;
+  const int32_t dilation_height_factor = _params.dilation_height_factor;
+  const int32_t dilation_width_factor = _params.dilation_width_factor;
+
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+  // Rescale factors built from the input scale, the filter scales and the output scale.
+  const std::vector<double> effective_output_scale =
+    getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+
+  const std::vector<ChannelQuantMultipliers> multipliers_raw =
+    quantizeMultipliers(effective_output_scale);
+  BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(multipliers_raw);
+  // Direct convolution: one int32 accumulator per output element; out-of-range taps are skipped.
+  for (int32_t batch = 0; batch < batches; ++batch)
+  {
+    for (int32_t out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int32_t out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int32_t out_c = 0; out_c < output_depth; ++out_c)
+        {
+          const int32_t in_y_origin = out_y * stride_height - _padding_height;
+          const int32_t in_x_origin = out_x * stride_width - _padding_width;
+          int32_t acc = 0;
+          for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
+          {
+            for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
+            {
+              const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
+              const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
+              if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
+              {
+                for (int32_t in_c = 0; in_c < input_depth; ++in_c)
+                {
+                  const uint8_t input_val =
+                    input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
+                  const uint8_t filter_val =
+                    filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
+                  acc += static_cast<int32_t>(input_val - input()->zero_point()) *
+                         static_cast<int32_t>(filter_val - filter()->zero_points()[out_c]);
+                }
+              }
+            }
+          }
+          if (bias_data)
+          {
+            acc += bias_data[out_c];
+          }
+          // Requantize with the channel's multiplier/shift, add the output zero point, then clamp.
+          int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
+            acc, quant_multipliers[out_c].multiplier, quant_multipliers[out_c].shift);
+
+          scaled_acc += output()->zero_point();
+          scaled_acc = std::max(scaled_acc, activation_min);
+          scaled_acc = std::min(scaled_acc, activation_max);
+          output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
+        }
+      }
+    }
+  }
+}
+
+void Conv2D::evalQuantizedS8PerChannel() const // S8 path, delegated to the PAL ConvPerChannel.
+{
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+
+  tflite::ConvParams params{};
+  params.padding_values.height = _padding_height;
+  params.padding_values.width = _padding_width;
+  params.stride_height = _params.stride_height;
+  params.stride_width = _params.stride_width;
+  params.dilation_height_factor = _params.dilation_height_factor;
+  params.dilation_width_factor = _params.dilation_width_factor;
+  // The kernel expects the input zero point to be negated.
+  params.input_offset = -input()->zero_point(); // Note the '-'.
+  params.weights_offset = 0;                    // Unused in tflite code
+  params.output_offset = output()->zero_point();
+  params.quantized_activation_min = activation_min;
+  params.quantized_activation_max = activation_max;
+
+  const std::vector<double> effective_output_scales =
+    getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+
+  std::vector<ChannelQuantMultipliers> quant_multipliers =
+    quantizeMultipliers(effective_output_scales);
+  // Split the multiplier/shift pairs into two parallel arrays, as passed to the PAL call below.
+  std::vector<int32_t> shifts;
+  std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts),
+                 [](ChannelQuantMultipliers cm) { return cm.shift; });
+  std::vector<int32_t> multipliers;
+  std::transform(quant_multipliers.begin(), quant_multipliers.end(),
+                 std::back_inserter(multipliers),
+                 [](ChannelQuantMultipliers cm) { return cm.multiplier; });
+  // The scratchpad buffer is passed only when the tensor is allocatable; otherwise nullptr.
+  auto scratchpad = getOutputTensors()[1];
+  int8_t *scratchpad_data = nullptr;
+  if (scratchpad->is_allocatable())
+    scratchpad_data = scratchpad->data<int8_t>();
+
+  luci_interpreter_pal::ConvPerChannel(
+    params, multipliers.data(), shifts.data(), getTensorShape(input()),
+    getTensorData<int8_t>(input()), getTensorShape(filter()), getTensorData<int8_t>(filter()),
+    getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()),
+    getTensorData<int8_t>(output()), getTensorShape(scratchpad), scratchpad_data);
+}
+
+void Conv2D::evalQuantizedS16() const // S16 path (plain loops, 64-bit accumulator).
+{
+  const auto *input_data = getTensorData<int16_t>(input());
+  const auto *filter_data = getTensorData<int16_t>(filter());
+  const auto *bias_data = getTensorData<int64_t>(bias());
+  auto *output_data = getTensorData<int16_t>(output());
+
+  const Shape &input_shape = input()->shape();
+  const Shape &filter_shape = filter()->shape();
+  const Shape &output_shape = output()->shape();
+  // Dims are read as input/output [N, H, W, C] and filter [out_C, H, W, in_C].
+  const int32_t batches = input_shape.dim(0);
+  const int32_t input_height = input_shape.dim(1);
+  const int32_t input_width = input_shape.dim(2);
+  const int32_t input_depth = input_shape.dim(3);
+  const int32_t output_depth = filter_shape.dim(0);
+  const int32_t filter_height = filter_shape.dim(1);
+  const int32_t filter_width = filter_shape.dim(2);
+  const int32_t output_height = output_shape.dim(1);
+  const int32_t output_width = output_shape.dim(2);
+
+  const int32_t stride_height = _params.stride_height;
+  const int32_t stride_width = _params.stride_width;
+  const int32_t dilation_height_factor = _params.dilation_height_factor;
+  const int32_t dilation_width_factor = _params.dilation_width_factor;
+
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+
+  const std::vector<double> effective_output_scale =
+    getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+
+  const std::vector<ChannelQuantMultipliers> multipliers_raw =
+    quantizeMultipliers(effective_output_scale);
+  BroadcastableWrapper<ChannelQuantMultipliers> multipliers(multipliers_raw);
+  // No zero point is applied in this path: not to input or filter values, nor to the output.
+  for (int32_t batch = 0; batch < batches; ++batch)
+  {
+    for (int32_t out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int32_t out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int32_t out_c = 0; out_c < output_depth; ++out_c)
+        {
+          const int32_t in_y_origin = out_y * stride_height - _padding_height;
+          const int32_t in_x_origin = out_x * stride_width - _padding_width;
+          int64_t acc = 0;
+          for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
+          {
+            for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
+            {
+              const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
+              const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
+              if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
+              {
+                for (int32_t in_c = 0; in_c < input_depth; ++in_c)
+                {
+                  const int16_t input_val =
+                    input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
+                  const int16_t filter_val =
+                    filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
+                  acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
+                }
+              }
+            }
+          }
+          if (bias_data)
+          {
+            acc += bias_data[out_c];
+          }
+          // Requantize the 64-bit accumulator to int32, clamp, then store into the int16 output.
+          int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
+            acc, multipliers[out_c].multiplier, multipliers[out_c].shift);
+
+          scaled_acc = std::max(scaled_acc, activation_min);
+          scaled_acc = std::min(scaled_acc, activation_max);
+
+          output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
+        }
+      }
+    }
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Conv2D.h b/compiler/luci-micro/luci-interpreter/src/kernels/Conv2D.h
new file mode 100644
index 000000000..330bf3a2a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Conv2D.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_CONV2D_H
+#define LUCI_INTERPRETER_KERNELS_CONV2D_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+#include <memory>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Conv2D : public KernelWithParams<Conv2DParams> // 2-D convolution kernel.
+{
+public:
+  Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output,
+         Tensor *scratchpad, const Conv2DParams &params);
+  // Tensor slots: _inputs[0..2] are input, filter and bias; _outputs[0] is the result.
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *filter() const { return _inputs[1]; }
+  const Tensor *bias() const { return _inputs[2]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;                 // FLOAT32 input and filter.
+  void evalQuantized() const;             // U8 input, filter with a single scale.
+  void evalQuantizedPerChannel() const;   // U8 input, filter with several scales.
+  void evalQuantizedS8PerChannel() const; // S8 input.
+  void evalQuantizedS16() const;          // S16 input.
+
+private:
+  int32_t _padding_height{}; // Assigned by configure().
+  int32_t _padding_width{};  // Assigned by configure().
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_CONV2D_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Conv2D.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Conv2D.test.cpp
new file mode 100644
index 000000000..0fe6ef795
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Conv2D.test.cpp
@@ -0,0 +1,707 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Conv2D.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class Conv2DTest : public ::testing::Test
+{
+protected:
+  // Memory manager handed to the tensor factories; SetUp() installs a new TestMemoryManager.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+};
+
+TEST_F(Conv2DTest, Float)
+{
+  // FLOAT32 case: VALID padding, stride 2 along height and 1 along width, fused ReLU.
+  Conv2DParams conv_params{};
+  conv_params.activation = Activation::RELU;
+  conv_params.padding = Padding::VALID;
+  conv_params.dilation_width_factor = 1;
+  conv_params.dilation_height_factor = 1;
+  conv_params.stride_width = 1;
+  conv_params.stride_height = 2;
+  Shape input_dims{1, 4, 3, 2};
+  Shape filter_dims{2, 2, 2, 2};
+  Shape bias_dims{2};
+  std::vector<float> input_values{
+    1,  2,  3,  4,  5,  6,  // row = 0
+    7,  8,  9,  10, 11, 12, // row = 1
+    13, 14, 15, 16, 17, 18, // row = 2
+    19, 20, 21, 22, 23, 24, // row = 3
+  };
+  std::vector<float> filter_values{
+    1,  2,  -3, -4, // out = 0, row = 0
+    -5, 6,  -7, 8,  // out = 0, row = 1
+    4,  -2, 3,  -1, // out = 1, row = 0
+    -8, -6, 7,  5,  // out = 1, row = 1
+  };
+  std::vector<float> bias_values{1, 2};
+  Tensor input =
+    makeInputTensor<DataType::FLOAT32>(input_dims, input_values, _memory_manager.get());
+  Tensor filter =
+    makeInputTensor<DataType::FLOAT32>(filter_dims, filter_values, _memory_manager.get());
+  Tensor bias =
+    makeInputTensor<DataType::FLOAT32>(bias_dims, bias_values, _memory_manager.get());
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  // Expected values (negative sums are clamped to 0 by ReLU) and expected output shape.
+  std::vector<float> expected_values{
+    11, 16, 7, 20, // row = 0
+    0,  40, 0, 44, // row = 1
+  };
+  std::vector<int32_t> expected_dims{1, 2, 2, 2};
+  // Scratchpad and output buffers are allocated only after configure() has run.
+  Conv2D conv(&input, &filter, &bias, &output, &scratchpad, conv_params);
+  conv.configure();
+  _memory_manager->allocate_memory(scratchpad);
+  _memory_manager->allocate_memory(output);
+  conv.execute();
+  EXPECT_THAT(extractTensorData<float>(output), FloatArrayNear(expected_values));
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray(expected_dims));
+}
+
+TEST_F(Conv2DTest, FloatPointwise)
+{
+  // FLOAT32 convolution with a 1x1 filter, unit strides, VALID padding and fused ReLU.
+  Conv2DParams conv_params{};
+  conv_params.activation = Activation::RELU;
+  conv_params.padding = Padding::VALID;
+  conv_params.dilation_width_factor = 1;
+  conv_params.dilation_height_factor = 1;
+  conv_params.stride_width = 1;
+  conv_params.stride_height = 1;
+  Shape input_dims{1, 2, 2, 2};
+  Shape filter_dims{2, 1, 1, 2};
+  Shape bias_dims{2};
+  std::vector<float> input_values{
+    1, 2, // row = 0, col = 0
+    3, 4, // row = 0, col = 1
+    5, 6, // row = 1, col = 0
+    7, 8, // row = 1, col = 1
+  };
+  std::vector<float> filter_values{
+    -1, 2, // out = 0
+    -3, 4, // out = 1
+  };
+  std::vector<float> bias_values{1, 2};
+  Tensor input =
+    makeInputTensor<DataType::FLOAT32>(input_dims, input_values, _memory_manager.get());
+  Tensor filter =
+    makeInputTensor<DataType::FLOAT32>(filter_dims, filter_values, _memory_manager.get());
+  Tensor bias =
+    makeInputTensor<DataType::FLOAT32>(bias_dims, bias_values, _memory_manager.get());
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+  // Expected values: per pixel, one dot product with each filter plus the matching bias.
+  std::vector<float> expected_values{
+    4, 7,  6,  9,  // row = 0
+    8, 11, 10, 13, // row = 1
+  };
+  std::vector<int32_t> expected_dims{1, 2, 2, 2};
+  // Scratchpad and output buffers are allocated only after configure() has run.
+  Conv2D conv(&input, &filter, &bias, &output, &scratchpad, conv_params);
+  conv.configure();
+  _memory_manager->allocate_memory(scratchpad);
+  _memory_manager->allocate_memory(output);
+  conv.execute();
+  EXPECT_THAT(extractTensorData<float>(output), FloatArrayNear(expected_values));
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray(expected_dims));
+}
+
+TEST_F(Conv2DTest, FloatCheck)
+{
+  // Two-batch FLOAT32 input, three 2x2 filters, stride 2x2, VALID padding, no activation.
+  Conv2DParams conv_params{};
+  conv_params.activation = Activation::NONE;
+  conv_params.padding = Padding::VALID;
+  conv_params.dilation_width_factor = 1;
+  conv_params.dilation_height_factor = 1;
+  conv_params.stride_width = 2;
+  conv_params.stride_height = 2;
+  Shape input_dims{2, 2, 4, 1};
+  Shape filter_dims{3, 2, 2, 1};
+  Shape bias_dims{3};
+  std::vector<float> input_values{
+    // First batch
+    1, 1, 1, 1, // row = 1
+    2, 2, 2, 2, // row = 2
+    // Second batch
+    1, 2, 3, 4, // row = 1
+    1, 2, 3, 4, // row = 2
+  };
+  std::vector<float> filter_values{
+    1,  2,  3,  4, // first 2x2 filter
+    -1, 1,  -1, 1, // second 2x2 filter
+    -1, -1, 1,  1, // third 2x2 filter
+  };
+  std::vector<float> bias_values{1, 2, 3};
+  Tensor input =
+    makeInputTensor<DataType::FLOAT32>(input_dims, input_values, _memory_manager.get());
+  Tensor filter =
+    makeInputTensor<DataType::FLOAT32>(filter_dims, filter_values, _memory_manager.get());
+  Tensor bias =
+    makeInputTensor<DataType::FLOAT32>(bias_dims, bias_values, _memory_manager.get());
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  // Expected values: three channels for each of the two window positions in each batch.
+  std::vector<float> expected_values{
+    18, 2, 5, // first batch, left
+    18, 2, 5, // first batch, right
+    17, 4, 3, // second batch, left
+    37, 4, 3, // second batch, right
+  };
+  std::vector<int32_t> expected_dims{2, 1, 2, 3};
+  // Output and scratchpad buffers are allocated only after configure() has run.
+  Conv2D conv(&input, &filter, &bias, &output, &scratchpad, conv_params);
+  conv.configure();
+  _memory_manager->allocate_memory(output);
+  _memory_manager->allocate_memory(scratchpad);
+  conv.execute();
+  EXPECT_THAT(extractTensorData<float>(output), FloatArrayNear(expected_values));
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray(expected_dims));
+}
+
+TEST_F(Conv2DTest, Uint8)
+{
+  // U8 convolution with per-tensor quantization, stride 2x2, VALID padding, no activation.
+  Conv2DParams conv_params{};
+  conv_params.activation = Activation::NONE;
+  conv_params.padding = Padding::VALID;
+  conv_params.dilation_width_factor = 1;
+  conv_params.dilation_height_factor = 1;
+  conv_params.stride_width = 2;
+  conv_params.stride_height = 2;
+
+  std::vector<float> input_values{
+    // First batch
+    1, 1, 1, 1, // row = 1
+    2, 2, 2, 2, // row = 2
+    // Second batch
+    1, 2, 3, 4, // row = 1
+    1, 2, 3, 4, // row = 2
+  };
+  std::vector<float> filter_values{
+    1,  2,  3,  4, // first 2x2 filter
+    -1, 1,  -1, 1, // second 2x2 filter
+    -1, -1, 1,  1, // third 2x2 filter
+  };
+  std::vector<float> bias_values{1, 2, 3};
+
+  // The filter reuses the input quantization, hence the bias scale of input scale squared.
+  std::pair<float, int32_t> input_qp = quantizationParams<uint8_t>(-63.5, 64);
+  std::pair<float, int32_t> output_qp = quantizationParams<uint8_t>(-127, 128);
+  Tensor input =
+    makeInputTensor<DataType::U8>({2, 2, 4, 1}, input_qp.first, input_qp.second, input_values,
+                                  _memory_manager.get());
+  Tensor filter =
+    makeInputTensor<DataType::U8>({3, 2, 2, 1}, input_qp.first, input_qp.second, filter_values,
+                                  _memory_manager.get());
+  Tensor bias = makeInputTensor<DataType::S32>({3}, input_qp.first * input_qp.first, 0,
+                                               bias_values, _memory_manager.get());
+  Tensor scratchpad(DataType::U8, Shape({}), {}, "");
+  Tensor output = makeOutputTensor(DataType::U8, output_qp.first, output_qp.second);
+
+  // Expected values are compared against the dequantized U8 output.
+  std::vector<float> expected_values{
+    18, 2, 5, // first batch, left
+    18, 2, 5, // first batch, right
+    17, 4, 3, // second batch, left
+    37, 4, 3, // second batch, right
+  };
+  std::vector<int32_t> expected_dims{2, 1, 2, 3};
+  Conv2D conv(&input, &filter, &bias, &output, &scratchpad, conv_params);
+  conv.configure();
+  _memory_manager->allocate_memory(output);
+  _memory_manager->allocate_memory(scratchpad);
+  conv.execute();
+  EXPECT_THAT(dequantizeTensorData(output), FloatArrayNear(expected_values));
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray(expected_dims));
+}
+
+TEST_F(Conv2DTest, Uint8_CWQ)
+{
+  // U8 convolution whose filter and bias carry one quantization scale per output channel.
+  Conv2DParams conv_params{};
+  conv_params.activation = Activation::NONE;
+  conv_params.padding = Padding::VALID;
+  conv_params.dilation_width_factor = 1;
+  conv_params.dilation_height_factor = 1;
+  conv_params.stride_width = 2;
+  conv_params.stride_height = 2;
+
+  const int output_channels = 3;
+  Shape filter_dims{output_channels, 2, 2, 1};
+  std::vector<float> input_values{
+    // First batch
+    1, 1, 1, 1, // row = 1
+    2, 2, 2, 2, // row = 2
+    // Second batch
+    1, 2, 3, 4, // row = 1
+    1, 2, 3, 4, // row = 2
+  };
+  std::vector<float> filter_values{
+    1,  2,  3,  4, // first 2x2 filter
+    -1, 1,  -1, 1, // second 2x2 filter
+    -1, -1, 1,  1, // third 2x2 filter
+  };
+  std::vector<float> bias_values{1, 2, 3};
+
+  std::pair<float, int32_t> input_qp = quantizationParams<uint8_t>(0, 4);
+  std::pair<float, int32_t> output_qp = quantizationParams<uint8_t>(-127, 128);
+  // One (scale, zero point) pair per output channel of the filter.
+  std::vector<std::pair<float, int32_t>> filter_qps{
+    quantizationParams<uint8_t>(0, 4),
+    quantizationParams<uint8_t>(-1, 1),
+    quantizationParams<uint8_t>(-1, 1),
+  };
+  // Split the pairs into parallel vectors; each bias scale is filter scale * input scale.
+  std::vector<float> filter_scales;
+  std::vector<int32_t> filter_zerops;
+  std::vector<float> bias_scales;
+  for (const auto &qp : filter_qps)
+  {
+    filter_scales.push_back(qp.first);
+    filter_zerops.push_back(qp.second);
+    bias_scales.push_back(qp.first * input_qp.first);
+  }
+  std::vector<int32_t> bias_zerops(output_channels, 0);
+
+  Tensor input =
+    makeInputTensor<DataType::U8>({2, 2, 4, 1}, input_qp.first, input_qp.second, input_values,
+                                  _memory_manager.get());
+  Tensor filter = makeInputTensor<DataType::U8>(filter_dims, filter_scales, filter_zerops, 0,
+                                                filter_values, _memory_manager.get());
+  Tensor bias = makeInputTensor<DataType::S32>({output_channels}, bias_scales, bias_zerops, 0,
+                                               bias_values, _memory_manager.get());
+  Tensor scratchpad(DataType::U8, Shape({}), {}, "");
+  Tensor output = makeOutputTensor(DataType::U8, output_qp.first, output_qp.second);
+
+  // Expected values are compared against the dequantized U8 output.
+  std::vector<float> expected_values{
+    18, 2, 5, // first batch, left
+    18, 2, 5, // first batch, right
+    17, 4, 3, // second batch, left
+    37, 4, 3, // second batch, right
+  };
+  std::vector<int32_t> expected_dims{2, 1, 2, 3};
+
+  Conv2D conv(&input, &filter, &bias, &output, &scratchpad, conv_params);
+  conv.configure();
+  _memory_manager->allocate_memory(output);
+  _memory_manager->allocate_memory(scratchpad);
+  conv.execute();
+  EXPECT_THAT(dequantizeTensorData(output), FloatArrayNear(expected_values));
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray(expected_dims));
+}
+
+TEST_F(Conv2DTest, SInt8_CWQ)
+{
+  // S8 convolution whose filter and bias carry one quantization scale per output channel.
+  Conv2DParams conv_params{};
+  conv_params.activation = Activation::NONE;
+  conv_params.padding = Padding::VALID;
+  conv_params.dilation_width_factor = 1;
+  conv_params.dilation_height_factor = 1;
+  conv_params.stride_width = 2;
+  conv_params.stride_height = 2;
+
+  const int output_channels = 3;
+  Shape filter_dims{output_channels, 2, 2, 1};
+  std::vector<float> input_values{
+    // First batch
+    1, 1, 1, 1, // row = 1
+    2, 2, 2, 2, // row = 2
+    // Second batch
+    1, 2, 3, 4, // row = 1
+    1, 2, 3, 4, // row = 2
+  };
+  std::vector<float> filter_values{
+    1,  2,  3,  4, // first 2x2 filter
+    -1, 1,  -1, 1, // second 2x2 filter
+    -1, -1, 1,  1, // third 2x2 filter
+  };
+  std::vector<float> bias_values{1, 2, 3};
+
+  std::pair<float, int32_t> input_qp = quantizationParams<int8_t>(0, 4);
+  std::pair<float, int32_t> output_qp = quantizationParams<int8_t>(-127, 128);
+  // One (scale, zero point) pair per output channel of the filter; all zero points are 0.
+  std::vector<std::pair<float, int32_t>> filter_qps{
+    std::pair<float, int32_t>(0.5, 0),
+    std::pair<float, int32_t>(0.25, 0),
+    std::pair<float, int32_t>(0.125, 0),
+  };
+  // Split the pairs into parallel vectors; each bias scale is filter scale * input scale.
+  std::vector<float> filter_scales;
+  std::vector<int32_t> filter_zerops;
+  std::vector<float> bias_scales;
+  for (const auto &qp : filter_qps)
+  {
+    filter_scales.push_back(qp.first);
+    filter_zerops.push_back(qp.second);
+    bias_scales.push_back(qp.first * input_qp.first);
+  }
+  std::vector<int32_t> bias_zerops(output_channels, 0);
+
+  Tensor input =
+    makeInputTensor<DataType::S8>({2, 2, 4, 1}, input_qp.first, input_qp.second, input_values,
+                                  _memory_manager.get());
+  Tensor filter = makeInputTensor<DataType::S8>(filter_dims, filter_scales, filter_zerops, 0,
+                                                filter_values, _memory_manager.get());
+  Tensor bias = makeInputTensor<DataType::S32>({output_channels}, bias_scales, bias_zerops, 0,
+                                               bias_values, _memory_manager.get());
+  Tensor scratchpad(DataType::S8, Shape({}), {}, "");
+  Tensor output = makeOutputTensor(DataType::S8, output_qp.first, output_qp.second);
+
+  // Expected values are compared against the dequantized S8 output.
+  std::vector<float> expected_values{
+    18, 2, 5, // first batch, left
+    18, 2, 5, // first batch, right
+    17, 4, 3, // second batch, left
+    37, 4, 3, // second batch, right
+  };
+  std::vector<int32_t> expected_dims{2, 1, 2, 3};
+
+  Conv2D conv(&input, &filter, &bias, &output, &scratchpad, conv_params);
+  conv.configure();
+  _memory_manager->allocate_memory(output);
+  _memory_manager->allocate_memory(scratchpad);
+  conv.execute();
+  EXPECT_THAT(dequantizeTensorData(output), FloatArrayNear(expected_values));
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray(expected_dims));
+}
+
+TEST_F(Conv2DTest, SInt16)
+{
+  // S16 input and filter with an S64 bias; VALID padding, stride 2x1, fused ReLU.
+  Conv2DParams conv_params{};
+  conv_params.activation = Activation::RELU;
+  conv_params.padding = Padding::VALID;
+  conv_params.dilation_width_factor = 1;
+  conv_params.dilation_height_factor = 1;
+  conv_params.stride_width = 1;
+  conv_params.stride_height = 2;
+
+  Shape input_dims{1, 4, 3, 2};
+  Shape filter_dims{2, 2, 2, 2};
+  Shape bias_dims{2};
+  std::vector<float> input_values{
+    1,  2,  3,  4,  5,  6,  // row = 0
+    7,  8,  9,  10, 11, 12, // row = 1
+    13, 14, 15, 16, 17, 18, // row = 2
+    19, 20, 21, 22, 23, 24, // row = 3
+  };
+  std::vector<float> filter_values{
+    1,  2,  -3, -4, // out = 0, row = 0
+    -5, 6,  -7, 8,  // out = 0, row = 1
+    4,  -2, 3,  -1, // out = 1, row = 0
+    -8, -6, 7,  5,  // out = 1, row = 1
+  };
+  std::vector<float> bias_values{1, 2};
+  // Scales: input 0.25, filter 0.2, bias = input scale * filter scale; zero points are 0.
+  Tensor input =
+    makeInputTensor<DataType::S16>(input_dims, 0.25, 0, input_values, _memory_manager.get());
+  Tensor filter =
+    makeInputTensor<DataType::S16>(filter_dims, 0.2, 0, filter_values, _memory_manager.get());
+  Tensor bias =
+    makeInputTensor<DataType::S64>(bias_dims, 0.25 * 0.2, 0, bias_values, _memory_manager.get());
+  Tensor scratchpad(DataType::S16, Shape({}), {}, "");
+  Tensor output = makeOutputTensor(DataType::S16, 0.5, 0);
+  // Expected values are compared against the dequantized S16 output.
+  std::vector<float> expected_values{
+    11, 16, 7, 20, // row = 0
+    0,  40, 0, 44, // row = 1
+  };
+  std::vector<int32_t> expected_dims{1, 2, 2, 2};
+
+  Conv2D conv(&input, &filter, &bias, &output, &scratchpad, conv_params);
+  conv.configure();
+  _memory_manager->allocate_memory(output);
+  _memory_manager->allocate_memory(scratchpad);
+  conv.execute();
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray(expected_dims));
+  EXPECT_THAT(dequantizeTensorData(output), FloatArrayNear(expected_values));
+}
+
+TEST_F(Conv2DTest, SInt16_CWQ_weights)
+{
+  // S16 pointwise convolution with per-output-channel filter scales and an S64 bias.
+  Shape input_shape{1, 2, 2, 2};  // Batch x H x W x C
+  Shape filter_shape{3, 1, 1, 2}; // Out channels x H x W x In Channels
+  Shape bias_shape{3};
+  std::vector<int32_t> ref_output_shape{1, 2, 2, 3};
+  std::vector<float> input_data{
+    1, 2, // row = 0, col 0
+    3, 4, // row = 0, col 1
+    5, 6, // row = 1, col 0
+    7, 8, // row = 1, col 1
+  };
+  std::vector<float> filter_data{
+    4, -3, // out = 0
+    1, -3, // out = 1
+    5, -3, // out = 2
+  };
+  std::vector<float> bias_data{1, 10, 5};
+  std::vector<float> ref_output_data{
+    0, 5, 4,  // row 0, col 0
+    1, 1, 8,  // row 0, col 1
+    3, 0, 12, // row 1, col 0
+    5, 0, 16, // row 1, col 1
+  };
+  // Each bias scale is the matching filter scale multiplied by the input scale.
+  float input_scale = 0.25f;
+  float output_scale = 0.05f;
+  std::vector<float> filter_scales = {0.25f, 0.2f, 0.1f};
+  std::vector<float> bias_scales;
+  for (const float filter_scale : filter_scales)
+    bias_scales.push_back(filter_scale * input_scale);
+  std::vector<int32_t> zerop = {0, 0, 0};
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>(input_shape, input_scale, 0, input_data, _memory_manager.get());
+  Tensor filter_tensor = makeInputTensor<DataType::S16>(filter_shape, filter_scales, zerop, 0,
+                                                        filter_data, _memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, bias_scales, zerop, 0, bias_data,
+                                                      _memory_manager.get());
+  Tensor im2col(DataType::S16, Shape({}), {}, "");
+  Tensor output_tensor = makeOutputTensor(DataType::S16, output_scale, 0);
+
+  Conv2DParams params{};
+  params.padding = Padding::VALID;
+  params.stride_height = 1;
+  params.stride_width = 1;
+  params.dilation_height_factor = 1;
+  params.dilation_width_factor = 1;
+  params.activation = Activation::RELU;
+
+  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &im2col, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  _memory_manager->allocate_memory(im2col);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST_F(Conv2DTest, Unsupported_Type_Configure_NEG)
+{
+  // S32 input paired with FLOAT32 filter, bias and output: configure() is expected to throw.
+  Conv2DParams conv_params{};
+  conv_params.activation = Activation::RELU;
+  conv_params.padding = Padding::VALID;
+  conv_params.dilation_width_factor = 1;
+  conv_params.dilation_height_factor = 1;
+  conv_params.stride_width = 1;
+  conv_params.stride_height = 2;
+  Shape input_dims{1, 4, 3, 2};
+  Shape filter_dims{2, 2, 2, 2};
+  Shape bias_dims{2};
+  std::vector<int32_t> input_values{
+    1,  2,  3,  4,  5,  6,  // row = 0
+    7,  8,  9,  10, 11, 12, // row = 1
+    13, 14, 15, 16, 17, 18, // row = 2
+    19, 20, 21, 22, 23, 24, // row = 3
+  };
+  std::vector<float> filter_values{
+    1,  2,  -3, -4, // out = 0, row = 0
+    -5, 6,  -7, 8,  // out = 0, row = 1
+    4,  -2, 3,  -1, // out = 1, row = 0
+    -8, -6, 7,  5,  // out = 1, row = 1
+  };
+  std::vector<float> bias_values{1, 2};
+  Tensor input =
+    makeInputTensor<DataType::S32>(input_dims, input_values, _memory_manager.get());
+  Tensor filter =
+    makeInputTensor<DataType::FLOAT32>(filter_dims, filter_values, _memory_manager.get());
+  Tensor bias =
+    makeInputTensor<DataType::FLOAT32>(bias_dims, bias_values, _memory_manager.get());
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  // Only configure() is exercised; the kernel is never executed.
+  Conv2D conv(&input, &filter, &bias, &output, &scratchpad, conv_params);
+  EXPECT_ANY_THROW(conv.configure());
+}
+
+TEST_F(Conv2DTest, Invalid_Bias_Type_NEG)
+{
+  // U8 bias on an otherwise FLOAT32 convolution: configure() is expected to throw.
+  Conv2DParams conv_params{};
+  conv_params.activation = Activation::RELU;
+  conv_params.padding = Padding::VALID;
+  conv_params.dilation_width_factor = 1;
+  conv_params.dilation_height_factor = 1;
+  conv_params.stride_width = 1;
+  conv_params.stride_height = 2;
+  Shape input_dims{1, 4, 3, 2};
+  Shape filter_dims{2, 2, 2, 2};
+  Shape bias_dims{2};
+  std::vector<float> input_values{
+    1,  2,  3,  4,  5,  6,  // row = 0
+    7,  8,  9,  10, 11, 12, // row = 1
+    13, 14, 15, 16, 17, 18, // row = 2
+    19, 20, 21, 22, 23, 24, // row = 3
+  };
+  std::vector<float> filter_values{
+    1,  2,  -3, -4, // out = 0, row = 0
+    -5, 6,  -7, 8,  // out = 0, row = 1
+    4,  -2, 3,  -1, // out = 1, row = 0
+    -8, -6, 7,  5,  // out = 1, row = 1
+  };
+  std::vector<uint8_t> bias_values{1, 2};
+  Tensor input =
+    makeInputTensor<DataType::FLOAT32>(input_dims, input_values, _memory_manager.get());
+  Tensor filter =
+    makeInputTensor<DataType::FLOAT32>(filter_dims, filter_values, _memory_manager.get());
+  Tensor bias = makeInputTensor<DataType::U8>(bias_dims, bias_values, _memory_manager.get());
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  // Only configure() is exercised; the kernel is never executed.
+  Conv2D conv(&input, &filter, &bias, &output, &scratchpad, conv_params);
+  EXPECT_ANY_THROW(conv.configure());
+}
+
+TEST_F(Conv2DTest, Invalid_Bias_Data_NEG)
+{
+  // Bias holds 3 values while the filter's leading dimension is 2: configure() must throw.
+  Conv2DParams conv_params{};
+  conv_params.activation = Activation::RELU;
+  conv_params.padding = Padding::VALID;
+  conv_params.dilation_width_factor = 1;
+  conv_params.dilation_height_factor = 1;
+  conv_params.stride_width = 1;
+  conv_params.stride_height = 2;
+  Shape input_dims{1, 4, 3, 2};
+  Shape filter_dims{2, 2, 2, 2};
+  Shape bias_dims{3};
+  std::vector<float> input_values{
+    1,  2,  3,  4,  5,  6,  // row = 0
+    7,  8,  9,  10, 11, 12, // row = 1
+    13, 14, 15, 16, 17, 18, // row = 2
+    19, 20, 21, 22, 23, 24, // row = 3
+  };
+  std::vector<float> filter_values{
+    1,  2,  -3, -4, // out = 0, row = 0
+    -5, 6,  -7, 8,  // out = 0, row = 1
+    4,  -2, 3,  -1, // out = 1, row = 0
+    -8, -6, 7,  5,  // out = 1, row = 1
+  };
+  std::vector<float> bias_values{1, 2, 3};
+  Tensor input =
+    makeInputTensor<DataType::FLOAT32>(input_dims, input_values, _memory_manager.get());
+  Tensor filter =
+    makeInputTensor<DataType::FLOAT32>(filter_dims, filter_values, _memory_manager.get());
+  Tensor bias =
+    makeInputTensor<DataType::FLOAT32>(bias_dims, bias_values, _memory_manager.get());
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  // Only configure() is exercised; the kernel is never executed.
+  Conv2D conv(&input, &filter, &bias, &output, &scratchpad, conv_params);
+  EXPECT_ANY_THROW(conv.configure());
+}
+
+TEST_F(Conv2DTest, Invalid_Input_Shape_NEG)
+{
+  // Input's last dimension is 1 while the filter's last dimension is 2: configure() must throw.
+  Conv2DParams conv_params{};
+  conv_params.activation = Activation::RELU;
+  conv_params.padding = Padding::VALID;
+  conv_params.dilation_width_factor = 1;
+  conv_params.dilation_height_factor = 1;
+  conv_params.stride_width = 1;
+  conv_params.stride_height = 2;
+  Shape input_dims{1, 4, 6, 1};
+  Shape filter_dims{2, 2, 2, 2};
+  Shape bias_dims{2};
+  std::vector<float> input_values{
+    1,  2,  3,  4,  5,  6,  // row = 0
+    7,  8,  9,  10, 11, 12, // row = 1
+    13, 14, 15, 16, 17, 18, // row = 2
+    19, 20, 21, 22, 23, 24, // row = 3
+  };
+  std::vector<float> filter_values{
+    1,  2,  -3, -4, // out = 0, row = 0
+    -5, 6,  -7, 8,  // out = 0, row = 1
+    4,  -2, 3,  -1, // out = 1, row = 0
+    -8, -6, 7,  5,  // out = 1, row = 1
+  };
+  std::vector<float> bias_values{1, 2};
+  Tensor input =
+    makeInputTensor<DataType::FLOAT32>(input_dims, input_values, _memory_manager.get());
+  Tensor filter =
+    makeInputTensor<DataType::FLOAT32>(filter_dims, filter_values, _memory_manager.get());
+  Tensor bias =
+    makeInputTensor<DataType::FLOAT32>(bias_dims, bias_values, _memory_manager.get());
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  // Only configure() is exercised; the kernel is never executed.
+  Conv2D conv(&input, &filter, &bias, &output, &scratchpad, conv_params);
+  EXPECT_ANY_THROW(conv.configure());
+}
+
+TEST_F(Conv2DTest, Invalid_fused_act_tanh_NEG)
+{
+  // TANH is requested as the fused activation: configure() is expected to throw.
+  Conv2DParams conv_params{};
+  conv_params.activation = Activation::TANH;
+  conv_params.padding = Padding::VALID;
+  conv_params.dilation_width_factor = 1;
+  conv_params.dilation_height_factor = 1;
+  conv_params.stride_width = 1;
+  conv_params.stride_height = 2;
+  Shape input_dims{1, 4, 3, 2};
+  Shape filter_dims{2, 2, 2, 2};
+  Shape bias_dims{2};
+  std::vector<float> input_values{
+    1,  2,  3,  4,  5,  6,  // row = 0
+    7,  8,  9,  10, 11, 12, // row = 1
+    13, 14, 15, 16, 17, 18, // row = 2
+    19, 20, 21, 22, 23, 24, // row = 3
+  };
+  std::vector<float> filter_values{
+    1,  2,  -3, -4, // out = 0, row = 0
+    -5, 6,  -7, 8,  // out = 0, row = 1
+    4,  -2, 3,  -1, // out = 1, row = 0
+    -8, -6, 7,  5,  // out = 1, row = 1
+  };
+  std::vector<float> bias_values{1, 2};
+  Tensor input =
+    makeInputTensor<DataType::FLOAT32>(input_dims, input_values, _memory_manager.get());
+  Tensor filter =
+    makeInputTensor<DataType::FLOAT32>(filter_dims, filter_values, _memory_manager.get());
+  Tensor bias =
+    makeInputTensor<DataType::FLOAT32>(bias_dims, bias_values, _memory_manager.get());
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  // Only configure() is exercised; the kernel is never executed.
+  Conv2D conv(&input, &filter, &bias, &output, &scratchpad, conv_params);
+  EXPECT_ANY_THROW(conv.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/DepthToSpace.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/DepthToSpace.cpp
new file mode 100644
index 000000000..3a9acd1d4
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/DepthToSpace.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DepthToSpace.h"
+#include "Utils.h"
+#include "PALDepthToSpace.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Passes `input` and `output` to the base kernel as its one-element input and output lists.
+DepthToSpace::DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams &params)
+  : KernelWithParams<DepthToSpaceParams>({input}, {output}, params)
+{}
+
+void DepthToSpace::configure()
+{
+  // Validates rank, element types and block size, then resizes the output tensor to
+  // {dim(0), height * block_size, width * block_size, channels / block_size^2}.
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() == 4);
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32 ||
+                         output()->element_type() == DataType::U8);
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  const int block_size = params().block_size;
+  // block_size is used as a divisor below, so reject non-positive values up front.
+  LUCI_INTERPRETER_CHECK(block_size > 0);
+  const int32_t input_height = input()->shape().dim(1);
+  const int32_t input_width = input()->shape().dim(2);
+  const int32_t input_channels = input()->shape().dim(3);
+  int32_t output_height = input_height * block_size;
+  int32_t output_width = input_width * block_size;
+  int32_t output_channels = input_channels / block_size / block_size;
+  LUCI_INTERPRETER_CHECK(input_height == output_height / block_size);
+  LUCI_INTERPRETER_CHECK(input_width == output_width / block_size);
+  LUCI_INTERPRETER_CHECK(input_channels == output_channels * block_size * block_size);
+  Shape output_shape(4);
+  output_shape.dim(0) = input()->shape().dim(0);
+  output_shape.dim(1) = output_height;
+  output_shape.dim(2) = output_width;
+  output_shape.dim(3) = output_channels;
+  output()->resize(output_shape);
+}
+
+void DepthToSpace::execute() const
+{
+  // Forward to the PAL routine instantiated for the tensor element type.
+  tflite::DepthToSpaceParams pal_params;
+  pal_params.block_size = params().block_size;
+
+  // Dispatch on the input element type.
+  const auto element_type = input()->element_type();
+  if (element_type == DataType::FLOAT32)
+    luci_interpreter_pal::DepthToSpace(pal_params, getTensorShape(input()),
+                                       getTensorData<float>(input()), getTensorShape(output()),
+                                       getTensorData<float>(output()));
+  else if (element_type == DataType::U8)
+    luci_interpreter_pal::DepthToSpace(pal_params, getTensorShape(input()),
+                                       getTensorData<uint8_t>(input()), getTensorShape(output()),
+                                       getTensorData<uint8_t>(output()));
+  else
+    // Every other element type is rejected.
+    throw std::runtime_error("Unsupported Type.");
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/DepthToSpace.h b/compiler/luci-micro/luci-interpreter/src/kernels/DepthToSpace.h
new file mode 100644
index 000000000..63ce37610
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/DepthToSpace.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H
+#define LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+#include <vector>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class DepthToSpace : public KernelWithParams<DepthToSpaceParams>
+{
+public:
+  // Kernel over one input tensor and one output tensor, parameterised by `params`.
+  DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams &params);
+  void configure() override;
+  void execute() const override;
+  // Accessors for output slot 0 and input slot 0.
+  Tensor *output() const { return _outputs[0]; }
+  const Tensor *input() const { return _inputs[0]; }
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/DepthToSpace.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/DepthToSpace.test.cpp
new file mode 100644
index 000000000..88e6e07f1
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/DepthToSpace.test.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/DepthToSpace.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T> class DepthToSpaceTest : public ::testing::Test
+{
+};
+// Each TYPED_TEST of this suite is instantiated once for float and once for uint8_t.
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_SUITE(DepthToSpaceTest, DataTypes);
+
+TYPED_TEST(DepthToSpaceTest, SimpleCase)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<TypeParam> input_data{1, 2, 3, 4, 5, 6, 7, 8};
+  Shape input_shape{1, 1, 2, 4};
+  std::vector<TypeParam> output_data{1, 2, 5, 6, 3, 4, 7, 8};
+  std::vector<int32_t> output_shape{1, 2, 4, 1};
+  // The input tensor carries shape and data; the output tensor is created from a type only.
+  Tensor input_tensor =
+    makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
+  // block_size 2 is applied to the {1, 1, 2, 4} input.
+  DepthToSpaceParams params{};
+  params.block_size = 2;
+  // Output memory is allocated after configure() and before execute().
+  DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Both the element order and the resulting {1, 2, 4, 1} shape are verified.
+  EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
+              ::testing::ElementsAreArray(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+TEST(DepthToSpaceTest, InvalidInputShape_NEG)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8};
+  Shape input_shape{1, 2, 4}; // rank 3, unlike the rank-4 shape used by the passing case
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  DepthToSpaceParams params{};
+  params.block_size = 2;
+  // Negative case: configure() itself must throw; execute() is never reached.
+  DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(DepthToSpaceTest, InOutTypeMismatch_NEG)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8};
+  Shape input_shape{1, 1, 2, 4};
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8); // differs from the FLOAT32 input
+
+  DepthToSpaceParams params{};
+  params.block_size = 2;
+  // Negative case: configure() itself must throw; execute() is never reached.
+  DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(DepthToSpaceTest, InvalidBlockSize_NEG)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8};
+  Shape input_shape{1, 1, 2, 4};
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  DepthToSpaceParams params{};
+  params.block_size = 3; // presumably rejected because depth 4 is not a multiple of 3 * 3 - confirm
+  // Negative case: configure() itself must throw; execute() is never reached.
+  DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/DepthwiseConv2D.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/DepthwiseConv2D.cpp
new file mode 100644
index 000000000..c554c309d
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/DepthwiseConv2D.cpp
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/DepthwiseConv2D.h"
+
+#include "kernels/Utils.h"
+
+#include "PALDepthwiseConv2d.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+DepthwiseConv2D::DepthwiseConv2D(const Tensor *input, const Tensor *filter, const Tensor *bias,
+                                 Tensor *output, Tensor *scratchpad, // kept as outputs 0 and 1
+                                 const DepthwiseConv2DParams &params)
+  : KernelWithParams<DepthwiseConv2DParams>({input, filter, bias}, {output, scratchpad}, params)
+{
+}
+
+void DepthwiseConv2D::configure()
+{
+  // TensorFlow Lite (as of v2.2.0) supports the following combinations of types:
+  //     | input filter bias  output |
+  // ----+---------------------------+
+  // (1) | float float  float float  |
+  // (2) | float int8   float float  | hybrid
+  // (3) | uint8 uint8  int32 uint8  | quantized
+  // (4) | int8  int8   int32 int8   | quantized per channel
+  // (5) | int16 int8   int64 int16  | quantized per channel 16x8
+  //
+  // We only support (1), (3) and (4) for now, and additionally the following:
+  //     | input filter bias  output |
+  // ----+---------------------------+
+  // (6) | int16 int16  int64 int16  |
+  //
+  if (input()->element_type() == DataType::FLOAT32 && filter()->element_type() == DataType::FLOAT32)
+  {
+    LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
+  }
+  else if (input()->element_type() == DataType::U8 && filter()->element_type() == DataType::U8)
+  {
+    LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
+  }
+  else if (input()->element_type() == DataType::S8 && filter()->element_type() == DataType::S8)
+  {
+    LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
+    LUCI_INTERPRETER_CHECK(static_cast<uint32_t>(filter()->shape().dim(3)) ==
+                           filter()->scales().size());
+    for (auto zerop : filter()->zero_points())
+    {
+      LUCI_INTERPRETER_CHECK(zerop == 0);
+    }
+    LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
+  }
+  else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
+  {
+    LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+  LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
+  // Both input and filter must be rank 4 before individual dimensions are read.
+  const Shape &input_shape = input()->shape();
+  const Shape &filter_shape = filter()->shape();
+  LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 4 && filter_shape.num_dims() == 4);
+  // Input dimensions 0..2 are read as batches, height and width.
+  const int32_t batches = input_shape.dim(0);
+  const int32_t input_height = input_shape.dim(1);
+  const int32_t input_width = input_shape.dim(2);
+  // Filter format: [1, H, W, O].
+  LUCI_INTERPRETER_CHECK(filter_shape.dim(0) == 1);
+  const int32_t filter_height = filter_shape.dim(1);
+  const int32_t filter_width = filter_shape.dim(2);
+  const int32_t channels_out = filter_shape.dim(3);
+  // A bias, when present, must be 1-D with one entry per output channel.
+  LUCI_INTERPRETER_CHECK(bias() == nullptr || (bias()->shape().num_dims() == 1 &&
+                                               bias()->shape().dim(0) == channels_out));
+  // Output spatial size and padding are derived from padding mode, stride and dilation.
+  const int32_t output_height =
+    computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
+                      _params.dilation_height_factor);
+  const int32_t output_width =
+    computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
+                      _params.dilation_width_factor);
+
+  _padding_height = computePadding(_params.stride_height, _params.dilation_height_factor,
+                                   input_height, filter_height, output_height);
+  _padding_width = computePadding(_params.stride_width, _params.dilation_width_factor, input_width,
+                                  filter_width, output_width);
+
+  output()->resize({batches, output_height, output_width, channels_out});
+  // Only the dilation factors are filled in before the PAL sets up the scratchpad tensor.
+  tflite::DepthwiseParams params{};
+
+  params.dilation_height_factor = _params.dilation_height_factor;
+  params.dilation_width_factor = _params.dilation_width_factor;
+
+  auto scratchpad = getOutputTensors()[1];
+  luci_interpreter_pal::SetupScratchpadTensor(scratchpad, params, input()->element_type(),
+                                              getTensorShape(input()), getTensorShape(filter()),
+                                              getTensorShape(output()));
+}
+
+void DepthwiseConv2D::execute() const
+{
+  switch (input()->element_type()) // dispatch on the input element type
+  {
+    case DataType::FLOAT32:
+      if (filter()->element_type() == DataType::FLOAT32)
+      {
+        evalFloat();
+        break;
+      }
+      throw std::runtime_error("Unsupported type."); // FLOAT32 input with a non-float filter
+    case DataType::U8: // one filter scale -> per-tensor path, several -> per-channel path
+      if (filter()->scales().size() == 1)
+      {
+        evalQuantized();
+      }
+      else if (filter()->scales().size() > 1)
+      {
+        LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
+        LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
+                               static_cast<size_t>(filter()->shape().dim(3)));
+        evalQuantizedPerChannel();
+      }
+      break; // NOTE(review): a U8 filter with no scales reaches here without running a kernel
+    case DataType::S8:
+      evalQuantizedS8PerChannel();
+      break;
+    case DataType::S16:
+      evalQuantizedS16();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void DepthwiseConv2D::evalFloat() const
+{
+  float activation_min{};
+  float activation_max{};
+  calculateActivationRange(_params.activation, &activation_min, &activation_max);
+  // Copy padding, stride, dilation, depth multiplier and clamp range into the TFLite params.
+  tflite::DepthwiseParams params{};
+  params.padding_values.height = _padding_height;
+  params.padding_values.width = _padding_width;
+  params.stride_height = _params.stride_height;
+  params.stride_width = _params.stride_width;
+  params.dilation_height_factor = _params.dilation_height_factor;
+  params.dilation_width_factor = _params.dilation_width_factor;
+  params.depth_multiplier = _params.depth_multiplier;
+  params.float_activation_min = activation_min;
+  params.float_activation_max = activation_max;
+  // Delegate the computation to the TFLite reference implementation.
+  tflite::reference_ops::DepthwiseConv(
+    params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
+    getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
+    getTensorShape(output()), getTensorData<float>(output()));
+}
+
+void DepthwiseConv2D::evalQuantizedPerChannel() const
+{
+  const auto *input_data = getTensorData<uint8_t>(input());
+  const auto *filter_data = getTensorData<uint8_t>(filter());
+  const auto *bias_data = getTensorData<int32_t>(bias());
+  auto *output_data = getTensorData<uint8_t>(output());
+  // Shapes and extents used by the loop nest below.
+  const Shape &input_shape = input()->shape();
+  const Shape &filter_shape = filter()->shape();
+  const Shape &output_shape = output()->shape();
+
+  const int32_t batches = input_shape.dim(0);
+  const int32_t input_height = input_shape.dim(1);
+  const int32_t input_width = input_shape.dim(2);
+  const int32_t input_depth = input_shape.dim(3);
+  const int32_t filter_height = filter_shape.dim(1);
+  const int32_t filter_width = filter_shape.dim(2);
+  const int32_t output_height = output_shape.dim(1);
+  const int32_t output_width = output_shape.dim(2);
+
+  const int32_t stride_height = _params.stride_height;
+  const int32_t stride_width = _params.stride_width;
+  const int32_t dilation_height_factor = _params.dilation_height_factor;
+  const int32_t dilation_width_factor = _params.dilation_width_factor;
+  const int32_t depth_multiplier = _params.depth_multiplier;
+  // Output clamp range for the configured activation.
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+  // Derive multiplier/shift pairs from the input, filter and output scales.
+  const std::vector<double> effective_output_scales =
+    getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+
+  std::vector<ChannelQuantMultipliers> quant_multipliers_raw =
+    quantizeMultipliers(effective_output_scales);
+  BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(quant_multipliers_raw);
+  // Direct loop over every output element; output channel = m + in_channel * depth_multiplier.
+  for (int batch = 0; batch < batches; ++batch)
+  {
+    for (int out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+        {
+          for (int m = 0; m < depth_multiplier; ++m)
+          {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - _padding_width;
+            const int in_y_origin = (out_y * stride_height) - _padding_height;
+            int32_t acc = 0; // <cstdint> name, consistent with the rest of this file
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+            {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+              {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y = in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                  (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);
+                if (is_point_inside_image)
+                {
+                  int32_t input_val =
+                    input_data[calcOffset(input_shape, batch, in_y, in_x, in_channel)];
+                  int32_t filter_val =
+                    filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, output_channel)];
+                  acc += (filter_val - filter()->zero_points()[output_channel]) *
+                         (input_val - input()->zero_point());
+                }
+              }
+            }
+            if (bias_data)
+            {
+              acc += bias_data[output_channel];
+            }
+            int32_t output_multiplier = quant_multipliers[output_channel].multiplier;
+            int output_shift = quant_multipliers[output_channel].shift;
+            int32_t scaled_acc =
+              tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+            scaled_acc += output()->zero_point();
+            scaled_acc = std::max(scaled_acc, activation_min);
+            scaled_acc = std::min(scaled_acc, activation_max);
+            output_data[calcOffset(output_shape, batch, out_y, out_x, output_channel)] =
+              static_cast<uint8_t>(scaled_acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+void DepthwiseConv2D::evalQuantized() const
+{
+  const auto input_scale = static_cast<double>(input()->scale());
+  const auto filter_scale = static_cast<double>(filter()->scale());
+  const auto output_scale = static_cast<double>(output()->scale());
+  // Fold the three scales into a single multiplier/shift pair.
+  const double real_multiplier = input_scale * filter_scale / output_scale;
+  int32_t output_multiplier{};
+  int output_shift{};
+  quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
+  // Output clamp range for the configured activation.
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+
+  tflite::DepthwiseParams params{};
+  params.padding_values.height = _padding_height;
+  params.padding_values.width = _padding_width;
+  params.stride_height = _params.stride_height;
+  params.stride_width = _params.stride_width;
+  params.dilation_height_factor = _params.dilation_height_factor;
+  params.dilation_width_factor = _params.dilation_width_factor;
+  params.depth_multiplier = _params.depth_multiplier;
+  // The kernel expects input and filter zero points to be negated.
+  params.input_offset = -input()->zero_point();    // Note the '-'.
+  params.weights_offset = -filter()->zero_point(); // Note the '-'.
+  params.output_offset = output()->zero_point();
+  params.output_multiplier = output_multiplier;
+  params.output_shift = output_shift;
+  params.quantized_activation_min = activation_min;
+  params.quantized_activation_max = activation_max;
+  // Delegate to the TFLite reference implementation (uint8 data, int32 bias).
+  tflite::reference_ops::DepthwiseConv(
+    params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()),
+    getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
+    getTensorShape(output()), getTensorData<uint8_t>(output()));
+}
+
+void DepthwiseConv2D::evalQuantizedS8PerChannel() const
+{
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+
+  tflite::DepthwiseParams params{};
+  // NOTE(review): padding_type is always kSame here, whatever _params.padding holds - confirm.
+  params.padding_type = tflite::PaddingType::kSame;
+  params.padding_values.height = _padding_height;
+  params.padding_values.width = _padding_width;
+  params.stride_height = _params.stride_height;
+  params.stride_width = _params.stride_width;
+  params.dilation_height_factor = _params.dilation_height_factor;
+  params.dilation_width_factor = _params.dilation_width_factor;
+  params.depth_multiplier = _params.depth_multiplier;
+  // The kernel expects input and filter zero points to be negated.
+  params.input_offset = -input()->zero_point(); // Note the '-'.
+  params.weights_offset = 0;
+  params.output_offset = output()->zero_point();
+  params.output_multiplier = 1; // unused in tflite code
+  params.output_shift = 0;      // unused in tflite code
+  params.quantized_activation_min = activation_min;
+  params.quantized_activation_max = activation_max;
+  // Derive multiplier/shift pairs from the input, filter and output scales.
+  const std::vector<double> effective_output_scales =
+    getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+
+  std::vector<ChannelQuantMultipliers> quant_multipliers =
+    quantizeMultipliers(effective_output_scales);
+  // Split the pairs into the two parallel arrays that are handed to the PAL kernel.
+  std::vector<int32_t> shifts;
+  std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts),
+                 [](ChannelQuantMultipliers cm) { return cm.shift; });
+  std::vector<int32_t> multipliers;
+  std::transform(quant_multipliers.begin(), quant_multipliers.end(),
+                 std::back_inserter(multipliers),
+                 [](ChannelQuantMultipliers cm) { return cm.multiplier; });
+  // The scratchpad buffer is passed only when the tensor is allocatable; otherwise nullptr.
+  auto scratchpad = getOutputTensors()[1];
+  int8_t *scratchpad_data = nullptr;
+  if (scratchpad->is_allocatable())
+    scratchpad_data = scratchpad->data<int8_t>();
+
+  luci_interpreter_pal::DepthwiseConvPerChannel<int8_t>(
+    params, multipliers.data(), shifts.data(), getTensorShape(input()),
+    getTensorData<int8_t>(input()), getTensorShape(filter()), getTensorData<int8_t>(filter()),
+    getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()),
+    getTensorData<int8_t>(output()), getTensorShape(scratchpad), scratchpad_data);
+}
+
+void DepthwiseConv2D::evalQuantizedS16() const
+{
+  const auto *input_data = getTensorData<int16_t>(input());
+  const auto *filter_data = getTensorData<int16_t>(filter());
+  const auto *bias_data = getTensorData<int64_t>(bias());
+  auto *output_data = getTensorData<int16_t>(output());
+  // Shapes and extents used by the loop nest below.
+  const Shape &input_shape = input()->shape();
+  const Shape &filter_shape = filter()->shape();
+  const Shape &output_shape = output()->shape();
+
+  const int32_t batches = input_shape.dim(0);
+  const int32_t input_height = input_shape.dim(1);
+  const int32_t input_width = input_shape.dim(2);
+  const int32_t input_depth = input_shape.dim(3);
+  const int32_t filter_height = filter_shape.dim(1);
+  const int32_t filter_width = filter_shape.dim(2);
+  const int32_t output_height = output_shape.dim(1);
+  const int32_t output_width = output_shape.dim(2);
+
+  const int32_t stride_height = _params.stride_height;
+  const int32_t stride_width = _params.stride_width;
+  const int32_t dilation_height_factor = _params.dilation_height_factor;
+  const int32_t dilation_width_factor = _params.dilation_width_factor;
+  const int32_t depth_multiplier = _params.depth_multiplier;
+  // Derive multiplier/shift pairs from the input, filter and output scales.
+  const std::vector<double> effective_output_scales =
+    getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+
+  std::vector<ChannelQuantMultipliers> quant_multipliers_raw =
+    quantizeMultipliers(effective_output_scales);
+
+  BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(quant_multipliers_raw);
+  // Output clamp range for the configured activation.
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+  // Direct loop over every output element; output channel = m + in_c * depth_multiplier.
+  for (int32_t batch = 0; batch < batches; ++batch)
+  {
+    for (int32_t out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int32_t out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int32_t in_c = 0; in_c < input_depth; ++in_c)
+        {
+          for (int32_t m = 0; m < depth_multiplier; ++m)
+          {
+            const int32_t out_c = m + in_c * depth_multiplier;
+            const int32_t in_y_origin = out_y * stride_height - _padding_height;
+            const int32_t in_x_origin = out_x * stride_width - _padding_width;
+            int64_t acc = 0;
+            for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
+            {
+              for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
+              {
+                const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
+                const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
+                if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
+                {
+                  const int16_t input_val =
+                    input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
+                  const int16_t filter_val =
+                    filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, out_c)];
+                  acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
+                }
+              }
+            }
+            if (bias_data != nullptr)
+            {
+              acc += bias_data[out_c];
+            }
+            // Rescale the 64-bit accumulator into a 32-bit value.
+            int32_t output_multiplier = quant_multipliers[out_c].multiplier;
+            int output_shift = quant_multipliers[out_c].shift;
+            int32_t scaled_acc =
+              tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+            // Clamp to the activation range; no zero point is applied anywhere on this path.
+            scaled_acc = std::max(scaled_acc, activation_min);
+            scaled_acc = std::min(scaled_acc, activation_max);
+
+            output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
+          }
+        }
+      }
+    }
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/DepthwiseConv2D.h b/compiler/luci-micro/luci-interpreter/src/kernels/DepthwiseConv2D.h
new file mode 100644
index 000000000..3d1faf6c1
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/DepthwiseConv2D.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_DEPTHWISECONV2D_H
+#define LUCI_INTERPRETER_KERNELS_DEPTHWISECONV2D_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class DepthwiseConv2D : public KernelWithParams<DepthwiseConv2DParams>
+{
+public:
+  DepthwiseConv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output,
+                  Tensor *scratchpad, const DepthwiseConv2DParams &params);
+  // Accessors: inputs 0..2 are input, filter and bias; output 0 is the result tensor.
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *filter() const { return _inputs[1]; }
+  const Tensor *bias() const { return _inputs[2]; }
+  Tensor *output() const { return _outputs[0]; }
+  // Kernel interface overrides; their definitions are not part of this header.
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+  void evalQuantizedPerChannel() const;
+  void evalQuantizedS8PerChannel() const;
+  void evalQuantizedS16() const;
+
+private:
+  int32_t _padding_height{}; // zero-initialized padding along the height axis
+  int32_t _padding_width{};  // zero-initialized padding along the width axis
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_DEPTHWISECONV2D_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp
new file mode 100644
index 000000000..6b4673f3e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp
@@ -0,0 +1,622 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/DepthwiseConv2D.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class DepthwiseConv2DTest : public ::testing::Test
+{
+protected:
+  // gtest fixture hook: installs a new TestMemoryManager behind the interface pointer.
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  std::unique_ptr<IMemoryManager> _memory_manager{}; // empty until SetUp() assigns it
+};
+
+TEST_F(DepthwiseConv2DTest, Float)
+{
+  auto *mm = _memory_manager.get();
+
+  // Operand values; every table row lists four consecutive elements.
+  std::vector<float> bias_values{1, 2, 3, 4};
+  std::vector<float> filter_values{
+    1,  2,   3,   4,   //
+    -9, 10,  -11, 12,  //
+    5,  6,   7,   8,   //
+    13, -14, 15,  -16, //
+  };
+  std::vector<float> input_values{
+    1,  2,  7,  8,  //
+    3,  4,  9,  10, //
+    5,  6,  11, 12, //
+    13, 14, 15, 16, //
+  };
+  Shape input_dims{1, 4, 2, 2};
+  Shape filter_dims{1, 2, 2, 4};
+  Shape bias_dims{4};
+
+  Tensor input = makeInputTensor<DataType::FLOAT32>(input_dims, input_values, mm);
+  Tensor filter = makeInputTensor<DataType::FLOAT32>(filter_dims, filter_values, mm);
+  Tensor bias = makeInputTensor<DataType::FLOAT32>(bias_dims, bias_values, mm);
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+
+  DepthwiseConv2DParams params{};
+  params.activation = Activation::RELU;
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_width = 1;
+  params.stride_height = 2;
+  params.dilation_width_factor = 1;
+  params.dilation_height_factor = 1;
+
+  DepthwiseConv2D kernel(&input, &filter, &bias, &output, &scratchpad, params);
+  kernel.configure();
+  mm->allocate_memory(scratchpad);
+  mm->allocate_memory(output);
+  kernel.execute();
+
+  std::vector<float> expected{
+    71,  0, 99,  0,  //
+    167, 0, 227, 28, //
+  };
+  EXPECT_THAT(extractTensorData<float>(output), FloatArrayNear(expected));
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray({1, 2, 1, 4}));
+}
+
+TEST_F(DepthwiseConv2DTest, Uint8)
+{
+  auto *mm = _memory_manager.get();
+
+  // The input and the filter are quantized with the same parameters in this case.
+  std::pair<float, int32_t> in_quant = quantizationParams<uint8_t>(-63.5, 64);
+  std::pair<float, int32_t> out_quant = quantizationParams<uint8_t>(-127, 128);
+  const float in_scale = in_quant.first;
+  const int32_t in_zp = in_quant.second;
+
+  std::vector<float> bias_values{1, 2, 3, 4};
+  std::vector<float> filter_values{
+    1,  2,   3,   4,   //
+    -9, 10,  -11, 12,  //
+    5,  6,   7,   8,   //
+    13, -14, 15,  -16, //
+  };
+  std::vector<float> input_values{
+    1, 2, 7,  8,  //
+    3, 4, 9,  10, //
+    5, 6, 11, 12, //
+  };
+
+  Tensor input = makeInputTensor<DataType::U8>({1, 3, 2, 2}, in_scale, in_zp, input_values, mm);
+  Tensor filter = makeInputTensor<DataType::U8>({1, 2, 2, 4}, in_scale, in_zp, filter_values, mm);
+  // The S32 bias uses the product of the two (identical) scales and a zero point of 0.
+  Tensor bias = makeInputTensor<DataType::S32>({4}, in_scale * in_scale, 0, bias_values, mm);
+  Tensor output = makeOutputTensor(DataType::U8, out_quant.first, out_quant.second);
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+
+  DepthwiseConv2DParams params{};
+  params.activation = Activation::NONE;
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_width = 1;
+  params.stride_height = 1;
+  params.dilation_width_factor = 1;
+  params.dilation_height_factor = 1;
+
+  DepthwiseConv2D kernel(&input, &filter, &bias, &output, &scratchpad, params);
+  kernel.configure();
+  mm->allocate_memory(output);
+  mm->allocate_memory(scratchpad);
+  kernel.execute();
+
+  // The U8 result is dequantized before it is compared with the float reference.
+  std::vector<float> expected{
+    71, -34, 99,  -20, //
+    91, -26, 127, -4,  //
+  };
+  EXPECT_THAT(dequantizeTensorData(output), FloatArrayNear(expected));
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray({1, 2, 1, 4}));
+}
+
+TEST_F(DepthwiseConv2DTest, SInt16)
+{
+  auto *mm = _memory_manager.get();
+
+  std::vector<float> bias_values{1, 2, 3, 4};
+  std::vector<float> filter_values{
+    1,  2,   3,   4,   //
+    -9, 10,  -11, 12,  //
+    5,  6,   7,   8,   //
+    13, -14, 15,  -16, //
+  };
+  std::vector<float> input_values{
+    1,  2,  7,  8,  //
+    3,  4,  9,  10, //
+    5,  6,  11, 12, //
+    13, 14, 15, 16, //
+  };
+  // Reference result in float, together with the expected output shape.
+  std::vector<float> expected{
+    71,  0, 99,  0,  //
+    167, 0, 227, 28, //
+  };
+  std::vector<int32_t> expected_shape{1, 2, 1, 4};
+
+  Shape input_dims{1, 4, 2, 2};
+  Shape filter_dims{1, 2, 2, 4};
+  Shape bias_dims{4};
+
+  // S16 input/filter with an S64 bias; every zero point is 0.
+  Tensor input = makeInputTensor<DataType::S16>(input_dims, 0.25, 0, input_values, mm);
+  Tensor filter = makeInputTensor<DataType::S16>(filter_dims, 0.2, 0, filter_values, mm);
+  Tensor bias = makeInputTensor<DataType::S64>(bias_dims, 0.25 * 0.2, 0, bias_values, mm);
+  Tensor output = makeOutputTensor(DataType::S16, 0.5, 0);
+  Tensor scratchpad(DataType::S64, Shape({}), {}, "");
+
+  DepthwiseConv2DParams params{};
+  params.activation = Activation::RELU;
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_width = 1;
+  params.stride_height = 2;
+  params.dilation_width_factor = 1;
+  params.dilation_height_factor = 1;
+
+  DepthwiseConv2D kernel(&input, &filter, &bias, &output, &scratchpad, params);
+  kernel.configure();
+  mm->allocate_memory(output);
+  mm->allocate_memory(scratchpad);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray(expected_shape));
+  EXPECT_THAT(dequantizeTensorData(output), FloatArrayNear(expected));
+}
+
+TEST_F(DepthwiseConv2DTest, SInt16_CWQ_weights)
+{
+  auto *mm = _memory_manager.get();
+  const int output_channels = 4;
+  std::vector<float> bias_values{1, 2, 3, 4};
+  std::vector<float> filter_values{
+    1,  2,   3,   4,   //
+    -9, 10,  -11, 12,  //
+    5,  6,   7,   8,   //
+    13, -14, 15,  -16, //
+  };
+  std::vector<float> input_values{
+    1,  2,  7,  8,  //
+    3,  4,  9,  10, //
+    5,  6,  11, 12, //
+    13, 14, 15, 16, //
+  };
+  std::vector<float> expected{
+    71,  0, 99,  0,  //
+    167, 0, 227, 28, //
+  };
+  std::vector<int32_t> expected_shape{1, 2, 1, output_channels};
+  Shape input_dims{1, 4, 2, 2};
+  Shape filter_dims{1, 2, 2, output_channels};
+  Shape bias_dims{4};
+
+  // Per-channel filter scales; a bias scale is its filter scale times the input scale.
+  float input_scale = 0.25;
+  std::vector<float> filter_scales{0.2f, 1.f, 0.5f, 0.1f};
+  std::vector<int32_t> zero_points(4, 0);
+  std::vector<float> bias_scales;
+  for (float filter_scale : filter_scales)
+    bias_scales.push_back(filter_scale * input_scale);
+
+  Tensor input = makeInputTensor<DataType::S16>(input_dims, input_scale, 0, input_values, mm);
+  Tensor filter =
+    makeInputTensor<DataType::S16>(filter_dims, filter_scales, zero_points, 3, filter_values, mm);
+  Tensor bias =
+    makeInputTensor<DataType::S64>(bias_dims, bias_scales, zero_points, 0, bias_values, mm);
+  Tensor output = makeOutputTensor(DataType::S16, 0.5, 0);
+  Tensor scratchpad(DataType::S16, Shape({}), {}, "");
+
+  DepthwiseConv2DParams params{};
+  params.activation = Activation::RELU;
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_width = 1;
+  params.stride_height = 2;
+  params.dilation_width_factor = 1;
+  params.dilation_height_factor = 1;
+
+  DepthwiseConv2D kernel(&input, &filter, &bias, &output, &scratchpad, params);
+  kernel.configure();
+  mm->allocate_memory(output);
+  mm->allocate_memory(scratchpad);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray(expected_shape));
+  EXPECT_THAT(dequantizeTensorData(output), FloatArrayNear(expected));
+}
+
+TEST_F(DepthwiseConv2DTest, Uint8_CWQ_weights)
+{
+  auto *mm = _memory_manager.get();
+  const int output_channels = 4;
+
+  std::vector<float> bias_values{1, 2, 3, 4};
+  std::vector<float> filter_values{
+    1,  2,   3,   4,   //
+    -9, 10,  -11, 12,  //
+    5,  6,   7,   8,   //
+    13, -14, 15,  -16, //
+  };
+  std::vector<float> input_values{
+    1, 2, 7,  8,  //
+    3, 4, 9,  10, //
+    5, 6, 11, 12, //
+  };
+  std::vector<float> expected{
+    71, -34, 99,  -20, //
+    91, -26, 127, -4,  //
+  };
+  std::vector<int32_t> expected_shape{1, 2, 1, output_channels};
+  Shape input_dims{1, 3, 2, 2};
+  Shape filter_dims{1, 2, 2, output_channels};
+  Shape bias_dims{4};
+
+  std::pair<float, int32_t> in_quant = quantizationParams<uint8_t>(0, 16);
+  std::pair<float, int32_t> out_quant = quantizationParams<uint8_t>(-127, 128);
+
+  // One (scale, zero point) pair per output channel of the filter.
+  std::vector<std::pair<float, int32_t>> filter_quant{
+    quantizationParams<uint8_t>(-9, 13),
+    quantizationParams<uint8_t>(-14, 10),
+    quantizationParams<uint8_t>(-11, 15),
+    quantizationParams<uint8_t>(-16, 12),
+  };
+
+  // Split the pairs into parallel vectors and derive the bias scale for each channel.
+  std::vector<float> filter_scales;
+  std::vector<int32_t> filter_zps;
+  std::vector<float> bias_scales;
+  for (const auto &channel : filter_quant)
+  {
+    filter_scales.push_back(channel.first);
+    filter_zps.push_back(channel.second);
+    bias_scales.push_back(channel.first * in_quant.first);
+  }
+  std::vector<int32_t> bias_zps(output_channels, 0);
+
+  Tensor input =
+    makeInputTensor<DataType::U8>(input_dims, in_quant.first, in_quant.second, input_values, mm);
+  Tensor filter =
+    makeInputTensor<DataType::U8>(filter_dims, filter_scales, filter_zps, 3, filter_values, mm);
+  Tensor bias =
+    makeInputTensor<DataType::S32>(bias_dims, bias_scales, bias_zps, 0, bias_values, mm);
+  Tensor output =
+    makeOutputTensor(DataType::U8, out_quant.first, out_quant.second);
+  Tensor scratchpad(DataType::U8, Shape({}), {}, "");
+
+  DepthwiseConv2DParams params{};
+  params.activation = Activation::NONE;
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_width = 1;
+  params.stride_height = 1;
+  params.dilation_width_factor = 1;
+  params.dilation_height_factor = 1;
+
+  DepthwiseConv2D kernel(&input, &filter, &bias, &output, &scratchpad, params);
+  kernel.configure();
+  mm->allocate_memory(output);
+  mm->allocate_memory(scratchpad);
+  kernel.execute();
+
+  // The output quantization scale is passed to FloatArrayNear as the tolerance.
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray(expected_shape));
+  EXPECT_THAT(dequantizeTensorData(output), FloatArrayNear(expected, out_quant.first));
+}
+
+TEST_F(DepthwiseConv2DTest, SInt8_CWQ_weights)
+{
+  auto *mm = _memory_manager.get();
+  const int output_channels = 4;
+
+  std::vector<float> bias_values{1, 2, 3, 4};
+  std::vector<float> filter_values{
+    1,  2,   3,   4,   //
+    -9, 10,  -11, 12,  //
+    5,  6,   7,   8,   //
+    13, -14, 15,  -16, //
+  };
+  std::vector<float> input_values{
+    1, 2, 7,  8,  //
+    3, 4, 9,  10, //
+    5, 6, 11, 12, //
+  };
+  std::vector<float> expected{
+    71, -34, 99,  -20, //
+    91, -26, 127, -4,  //
+  };
+  std::vector<int32_t> expected_shape{1, 2, 1, output_channels};
+  Shape input_dims{1, 3, 2, 2};
+  Shape filter_dims{1, 2, 2, output_channels};
+  Shape bias_dims{4};
+
+  std::pair<float, int32_t> in_quant = quantizationParams<int8_t>(-128, 127);
+  std::pair<float, int32_t> out_quant = quantizationParams<int8_t>(-127, 128);
+
+  // Explicit (scale, zero point) pairs, one per output channel; every zero point is 0.
+  std::vector<std::pair<float, int32_t>> filter_quant{
+    std::pair<float, int32_t>(0.5, 0),
+    std::pair<float, int32_t>(0.25, 0),
+    std::pair<float, int32_t>(1, 0),
+    std::pair<float, int32_t>(0.125, 0),
+  };
+
+  // Split the pairs into parallel vectors and derive the bias scale for each channel.
+  std::vector<float> filter_scales;
+  std::vector<int32_t> filter_zps;
+  std::vector<float> bias_scales;
+  for (const auto &channel : filter_quant)
+  {
+    filter_scales.push_back(channel.first);
+    filter_zps.push_back(channel.second);
+    bias_scales.push_back(channel.first * in_quant.first);
+  }
+  std::vector<int32_t> bias_zps(output_channels, 0);
+
+  Tensor input =
+    makeInputTensor<DataType::S8>(input_dims, in_quant.first, in_quant.second, input_values, mm);
+  Tensor filter =
+    makeInputTensor<DataType::S8>(filter_dims, filter_scales, filter_zps, 3, filter_values, mm);
+  Tensor bias =
+    makeInputTensor<DataType::S32>(bias_dims, bias_scales, bias_zps, 0, bias_values, mm);
+  Tensor output =
+    makeOutputTensor(DataType::S8, out_quant.first, out_quant.second);
+  Tensor scratchpad(DataType::S8, Shape({}), {}, "");
+
+  DepthwiseConv2DParams params{};
+  params.activation = Activation::NONE;
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_width = 1;
+  params.stride_height = 1;
+  params.dilation_width_factor = 1;
+  params.dilation_height_factor = 1;
+
+  DepthwiseConv2D kernel(&input, &filter, &bias, &output, &scratchpad, params);
+  kernel.configure();
+  mm->allocate_memory(output);
+  mm->allocate_memory(scratchpad);
+  kernel.execute();
+
+  // The output quantization scale is passed to FloatArrayNear as the tolerance.
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray(expected_shape));
+  EXPECT_THAT(dequantizeTensorData(output), FloatArrayNear(expected, out_quant.first));
+}
+
+TEST_F(DepthwiseConv2DTest, InvalidBiasType_NEG)
+{
+  auto *mm = _memory_manager.get();
+  // The bias is created as S32 while the input and filter are FLOAT32.
+  std::vector<int32_t> bias_values{1, 2, 3, 4};
+  std::vector<float> filter_values{
+    1,  2,   3,   4,   //
+    -9, 10,  -11, 12,  //
+    5,  6,   7,   8,   //
+    13, -14, 15,  -16, //
+  };
+  std::vector<float> input_values{
+    1,  2,  7,  8,  //
+    3,  4,  9,  10, //
+    5,  6,  11, 12, //
+    13, 14, 15, 16, //
+  };
+  Shape input_dims{1, 4, 2, 2};
+  Shape filter_dims{1, 2, 2, 4};
+  Shape bias_dims{4};
+
+  Tensor input = makeInputTensor<DataType::FLOAT32>(input_dims, input_values, mm);
+  Tensor filter = makeInputTensor<DataType::FLOAT32>(filter_dims, filter_values, mm);
+  Tensor bias = makeInputTensor<DataType::S32>(bias_dims, bias_values, mm);
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+
+  DepthwiseConv2DParams params{};
+  params.activation = Activation::RELU;
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_width = 1;
+  params.stride_height = 2;
+  params.dilation_width_factor = 1;
+  params.dilation_height_factor = 1;
+
+  DepthwiseConv2D kernel(&input, &filter, &bias, &output, &scratchpad, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(DepthwiseConv2DTest, InOutTypeMismatch_NEG)
+{
+  auto *mm = _memory_manager.get();
+
+  std::vector<float> bias_values{1, 2, 3, 4};
+  std::vector<float> filter_values{
+    1,  2,   3,   4,   //
+    -9, 10,  -11, 12,  //
+    5,  6,   7,   8,   //
+    13, -14, 15,  -16, //
+  };
+  std::vector<float> input_values{
+    1,  2,  7,  8,  //
+    3,  4,  9,  10, //
+    5,  6,  11, 12, //
+    13, 14, 15, 16, //
+  };
+  Shape input_dims{1, 4, 2, 2};
+  Shape filter_dims{1, 2, 2, 4};
+  Shape bias_dims{4};
+
+  // All inputs are FLOAT32, but the output tensor is declared as U8.
+  Tensor input = makeInputTensor<DataType::FLOAT32>(input_dims, input_values, mm);
+  Tensor filter = makeInputTensor<DataType::FLOAT32>(filter_dims, filter_values, mm);
+  Tensor bias = makeInputTensor<DataType::FLOAT32>(bias_dims, bias_values, mm);
+  Tensor output = makeOutputTensor(DataType::U8);
+  Tensor scratchpad(DataType::U8, Shape({}), {}, "");
+
+  DepthwiseConv2DParams params{};
+  params.activation = Activation::RELU;
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_width = 1;
+  params.stride_height = 2;
+  params.dilation_width_factor = 1;
+  params.dilation_height_factor = 1;
+
+  DepthwiseConv2D kernel(&input, &filter, &bias, &output, &scratchpad, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(DepthwiseConv2DTest, InvalidInputShape_NEG)
+{
+  auto *mm = _memory_manager.get();
+
+  std::vector<float> bias_values{1, 2, 3, 4};
+  std::vector<float> filter_values{
+    1,  2,   3,   4,   //
+    -9, 10,  -11, 12,  //
+    5,  6,   7,   8,   //
+    13, -14, 15,  -16, //
+  };
+  std::vector<float> input_values{
+    1,  2,  7,  8,  //
+    3,  4,  9,  10, //
+    5,  6,  11, 12, //
+    13, 14, 15, 16, //
+  };
+  // The input and filter shapes have only three dimensions in this case.
+  Shape input_dims{4, 2, 2};
+  Shape filter_dims{2, 2, 4};
+  Shape bias_dims{4};
+
+  Tensor input = makeInputTensor<DataType::FLOAT32>(input_dims, input_values, mm);
+  Tensor filter = makeInputTensor<DataType::FLOAT32>(filter_dims, filter_values, mm);
+  Tensor bias = makeInputTensor<DataType::FLOAT32>(bias_dims, bias_values, mm);
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+
+  DepthwiseConv2DParams params{};
+  params.activation = Activation::RELU;
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_width = 1;
+  params.stride_height = 2;
+  params.dilation_width_factor = 1;
+  params.dilation_height_factor = 1;
+
+  DepthwiseConv2D kernel(&input, &filter, &bias, &output, &scratchpad, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(DepthwiseConv2DTest, InvalidFilterShape_NEG)
+{
+  auto *mm = _memory_manager.get();
+
+  std::vector<float> bias_values{1, 2, 3, 4};
+  std::vector<float> filter_values{
+    1,  2,   3,   4,   //
+    -9, 10,  -11, 12,  //
+    5,  6,   7,   8,   //
+    13, -14, 15,  -16, //
+  };
+  std::vector<float> input_values{
+    1,  2,  7,  8,  //
+    3,  4,  9,  10, //
+    5,  6,  11, 12, //
+    13, 14, 15, 16, //
+  };
+  // The filter shape starts with a dimension of 2 in this case.
+  Shape input_dims{1, 4, 2, 2};
+  Shape filter_dims{2, 1, 2, 4};
+  Shape bias_dims{4};
+
+  Tensor input = makeInputTensor<DataType::FLOAT32>(input_dims, input_values, mm);
+  Tensor filter = makeInputTensor<DataType::FLOAT32>(filter_dims, filter_values, mm);
+  Tensor bias = makeInputTensor<DataType::FLOAT32>(bias_dims, bias_values, mm);
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+
+  DepthwiseConv2DParams params{};
+  params.activation = Activation::RELU;
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_width = 1;
+  params.stride_height = 2;
+  params.dilation_width_factor = 1;
+  params.dilation_height_factor = 1;
+
+  DepthwiseConv2D kernel(&input, &filter, &bias, &output, &scratchpad, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(DepthwiseConv2DTest, InvalidBiasDim_NEG)
+{
+  auto *mm = _memory_manager.get();
+
+  std::vector<float> bias_values{1, 2, 3, 4};
+  std::vector<float> filter_values{
+    1,  2,   3,   4,   //
+    -9, 10,  -11, 12,  //
+    5,  6,   7,   8,   //
+    13, -14, 15,  -16, //
+  };
+  std::vector<float> input_values{
+    1,  2,  7,  8,  //
+    3,  4,  9,  10, //
+    5,  6,  11, 12, //
+    13, 14, 15, 16, //
+  };
+  // The filter's last dimension is 2 whereas the bias holds 4 elements.
+  Shape input_dims{1, 4, 2, 2};
+  Shape filter_dims{1, 2, 4, 2};
+  Shape bias_dims{4};
+
+  Tensor input = makeInputTensor<DataType::FLOAT32>(input_dims, input_values, mm);
+  Tensor filter = makeInputTensor<DataType::FLOAT32>(filter_dims, filter_values, mm);
+  Tensor bias = makeInputTensor<DataType::FLOAT32>(bias_dims, bias_values, mm);
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, "");
+
+  DepthwiseConv2DParams params{};
+  params.activation = Activation::RELU;
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_width = 1;
+  params.stride_height = 2;
+  params.dilation_width_factor = 1;
+  params.dilation_height_factor = 1;
+
+  DepthwiseConv2D kernel(&input, &filter, &bias, &output, &scratchpad, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Dequantize.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Dequantize.cpp
new file mode 100644
index 000000000..96399e5c7
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Dequantize.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Dequantize.h"
+#include "kernels/Utils.h"
+#include "PALDequantize.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+// Passes `input` as the only input tensor and `output` as the only output tensor to Kernel.
+Dequantize::Dequantize(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {}
+
+void Dequantize::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == loco::DataType::S8 ||
+                         input()->element_type() == loco::DataType::U8 ||
+                         input()->element_type() == loco::DataType::S16);
+  // The input must carry exactly one scale.
+  LUCI_INTERPRETER_CHECK(input()->scales().size() == 1);
+  // An S16 input is additionally required to have a zero point of 0.
+  if (input()->element_type() == loco::DataType::S16)
+    LUCI_INTERPRETER_CHECK(input()->zero_point() == 0);
+  // Only a FLOAT32 output is accepted.
+  LUCI_INTERPRETER_CHECK(output()->element_type() == loco::DataType::FLOAT32);
+  // The output is resized to the shape of the input.
+  output()->resize(input()->shape());
+}
+
+void Dequantize::execute() const
+{
+  tflite::DequantizationParams op_params;
+  op_params.scale = input()->scale();
+  op_params.zero_point = input()->zero_point();
+
+  // Hands typed input data to the PAL routine; the destination buffer is always float.
+  const auto run = [&](const auto *input_data) {
+    luci_interpreter_pal::Dequantize(op_params, getTensorShape(input()), input_data,
+                                     getTensorShape(output()), getTensorData<float>(output()));
+  };
+
+  switch (input()->element_type())
+  {
+    case loco::DataType::U8:
+    {
+      run(getTensorData<uint8_t>(input()));
+      break;
+    }
+    case loco::DataType::S8:
+    {
+      run(getTensorData<int8_t>(input()));
+      break;
+    }
+    case loco::DataType::S16:
+    {
+      run(getTensorData<int16_t>(input()));
+      break;
+    }
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Dequantize.h b/compiler/luci-micro/luci-interpreter/src/kernels/Dequantize.h
new file mode 100644
index 000000000..5565df0e4
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Dequantize.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_DEQUANTIZE_H
+#define LUCI_INTERPRETER_KERNELS_DEQUANTIZE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Dequantize : public Kernel
+{
+public:
+  Dequantize(const Tensor *input, Tensor *output);
+  // Accessors for the single input (slot 0 of _inputs) and output (slot 0 of _outputs).
+  Tensor *output() const { return _outputs[0]; }
+  const Tensor *input() const { return _inputs[0]; }
+  // Kernel interface overrides; both are only declared in this header.
+  void execute() const override;
+  void configure() override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_DEQUANTIZE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Dequantize.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Dequantize.test.cpp
new file mode 100644
index 000000000..0cab633d6
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Dequantize.test.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Dequantize.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class DequantizeTest : public ::testing::Test
+{
+protected:
+  // gtest fixture hook: installs a new TestMemoryManager behind the interface pointer.
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  std::unique_ptr<IMemoryManager> _memory_manager{}; // empty until SetUp() assigns it
+};
+
+TEST_F(DequantizeTest, Uint8)
+{
+  // U8 input of shape {2, 5}; its quantization data is {{0.5}, {127}}.
+  Tensor input(loco::DataType::U8, {2, 5}, {{0.5}, {127}}, "");
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+
+  // The raw quantized values are written straight into the input buffer.
+  std::vector<uint8_t> raw_values{0, 1, 2, 3, 4, 251, 252, 253, 254, 255};
+  _memory_manager->allocate_memory(input);
+  input.writeData(raw_values.data(), raw_values.size() * sizeof(raw_values[0]));
+
+  Dequantize kernel(&input, &output);
+  kernel.configure();
+  _memory_manager->allocate_memory(output);
+  kernel.execute();
+
+  // The element comparison below is exact (ElementsAreArray), not tolerance based.
+  std::vector<float> expected{-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64};
+  EXPECT_THAT(extractTensorData<float>(output),
+              ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray({2, 5}));
+}
+
+TEST_F(DequantizeTest, Sint8)
+{
+  // S8 input of shape {2, 5}; its quantization data is {{0.5}, {-1}}.
+  Tensor input(loco::DataType::S8, {2, 5}, {{0.5}, {-1}}, "");
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+
+  // The raw quantized values are written straight into the input buffer.
+  std::vector<int8_t> raw_values{-128, -127, -126, -125, -124, 123, 124, 125, 126, 127};
+  _memory_manager->allocate_memory(input);
+  input.writeData(raw_values.data(), raw_values.size() * sizeof(raw_values[0]));
+
+  Dequantize kernel(&input, &output);
+  kernel.configure();
+  _memory_manager->allocate_memory(output);
+  kernel.execute();
+
+  // The element comparison below is exact (ElementsAreArray), not tolerance based.
+  std::vector<float> expected{-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64};
+  EXPECT_THAT(extractTensorData<float>(output),
+              ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray({2, 5}));
+}
+
+TEST_F(DequantizeTest, Sint16)
+{
+  // S16 input of shape {2, 5}; its quantization data is {{0.5}, {0}}.
+  Tensor input(loco::DataType::S16, {2, 5}, {{0.5}, {0}}, "");
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+
+  // The raw quantized values are written straight into the input buffer.
+  std::vector<int16_t> raw_values{-129, -126, -125, -124, -123, 124, 125, 126, 127, 131};
+  _memory_manager->allocate_memory(input);
+  input.writeData(raw_values.data(), raw_values.size() * sizeof(raw_values[0]));
+
+  Dequantize kernel(&input, &output);
+  kernel.configure();
+  _memory_manager->allocate_memory(output);
+  kernel.execute();
+
+  // The element comparison below is exact (ElementsAreArray), not tolerance based.
+  std::vector<float> expected{-64.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 65.5};
+  EXPECT_THAT(extractTensorData<float>(output),
+              ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray({2, 5}));
+}
+
+TEST_F(DequantizeTest, InvalidInputType_NEG)
+{
+  // The input is FLOAT32; the test expects configure() to throw for it.
+  std::vector<float> values{-129, -126, -125, -124, -123, 124, 125, 126, 127, 131};
+  auto *mm = _memory_manager.get();
+  Tensor input = makeInputTensor<DataType::FLOAT32>({2, 5}, values, mm);
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+
+  Dequantize kernel(&input, &output);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(DequantizeTest, InvalidOutputType_NEG)
+{
+  // An S16 input (zero point 0) is paired with an S8 output tensor;
+  // the test expects configure() to throw for this combination.
+  Tensor input(loco::DataType::S16, {2, 5}, {{0.5}, {0}}, "");
+  Tensor output = makeOutputTensor(DataType::S8, /*scale*/ 0.5, /*zero_point*/ -1);
+
+  std::vector<int16_t> raw_values{-129, -126, -125, -124, -123, 124, 125, 126, 127, 131};
+  _memory_manager->allocate_memory(input);
+  input.writeData(raw_values.data(), raw_values.size() * sizeof(raw_values[0]));
+
+  Dequantize kernel(&input, &output);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(DequantizeTest, InvalidInputZeroPoint_NEG)
+{
+  // The S16 input is created with zero point -1; the test expects configure() to throw.
+  std::vector<float> values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  auto *mm = _memory_manager.get();
+  Tensor input = makeInputTensor<DataType::S16>({2, 5}, 0.5, -1, values, mm);
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+
+  Dequantize kernel(&input, &output);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Div.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Div.cpp
new file mode 100644
index 000000000..dd1532278
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Div.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Div.h"
+#include "kernels/Utils.h"
+
+#include <stdexcept>
+#include <tensorflow/lite/kernels/internal/reference/div.h>
+#include <tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Div::Div(const Tensor *input1, const Tensor *input2, Tensor *output, const DivParams &params)
+  : KernelWithParams<DivParams>({input1, input2}, {output}, params) // all state lives in base
+{
+}
+
+void Div::configure()
+{
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type());
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type());
+  // With all three element types verified equal, size the output to the broadcast shape.
+  output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+// Runs the division by dispatching on the element type of the first operand.
+// FLOAT32, S64, S32 and U8 are handled; any other type raises std::runtime_error.
+void Div::execute() const
+{
+  const auto type = input1()->element_type();
+
+  if (type == DataType::FLOAT32)
+    return evalFloat();
+
+  if (type == DataType::S64)
+    return evalInteger<int64_t>();
+
+  if (type == DataType::S32)
+    return evalInteger<int32_t>();
+
+  if (type == DataType::U8)
+    return evalQuantized();
+
+  throw std::runtime_error("Unsupported type.");
+}
+
+void Div::evalFloat() const
+{
+  // Collect shapes and raw buffers up front so both kernel calls below read identically.
+  const auto lhs_shape = getTensorShape(input1());
+  const auto rhs_shape = getTensorShape(input2());
+  const auto out_shape = getTensorShape(output());
+  const auto lhs_data = getTensorData<float>(input1());
+  const auto rhs_data = getTensorData<float>(input2());
+  const auto out_data = getTensorData<float>(output());
+
+  tflite::ArithmeticParams op_params{};
+  fillArithmeticActivationRange<float>(op_params, _params.activation);
+
+  // ProcessBroadcastShapes gets op_params by pointer; its result picks the kernel to run.
+  if (tflite::reference_ops::ProcessBroadcastShapes(lhs_shape, rhs_shape, &op_params))
+    tflite::reference_ops::BroadcastDivSlow(op_params, lhs_shape, lhs_data, rhs_shape, rhs_data,
+                                            out_shape, out_data);
+  else
+    tflite::reference_ops::Div(op_params, lhs_shape, lhs_data, rhs_shape, rhs_data, out_shape,
+                               out_data);
+}
+
+template <typename T> void Div::evalInteger() const
+{
+  // Integer counterpart of evalFloat(); execute() instantiates it for int32_t and int64_t.
+  tflite::ArithmeticParams op_params{};
+  fillArithmeticActivationRange<T>(op_params, _params.activation);
+
+  const auto lhs_shape = getTensorShape(input1());
+  const auto rhs_shape = getTensorShape(input2());
+  const auto out_shape = getTensorShape(output());
+  const auto lhs_data = getTensorData<T>(input1());
+  const auto rhs_data = getTensorData<T>(input2());
+  const auto out_data = getTensorData<T>(output());
+
+  // ProcessBroadcastShapes gets op_params by pointer; its result picks the kernel to run.
+  if (tflite::reference_ops::ProcessBroadcastShapes(lhs_shape, rhs_shape, &op_params))
+    tflite::reference_ops::BroadcastDivSlow(op_params, lhs_shape, lhs_data, rhs_shape, rhs_data,
+                                            out_shape, out_data);
+  else
+    tflite::reference_ops::Div(op_params, lhs_shape, lhs_data, rhs_shape, rhs_data, out_shape,
+                               out_data);
+}
+
+void Div::evalQuantized() const
+{
+  // Real-valued rescale factor of the quotient, input1_scale / (input2_scale * output_scale),
+  // computed in double precision.
+  const double lhs_scale = input1()->scale();
+  const double rhs_scale = input2()->scale();
+  const double out_scale = output()->scale();
+  const double real_multiplier = lhs_scale / (rhs_scale * out_scale);
+
+  // The same factor in the integer multiplier + shift form that the kernel params carry.
+  int32_t fixed_multiplier{};
+  int fixed_shift{};
+  quantizeMultiplier(real_multiplier, &fixed_multiplier, &fixed_shift);
+
+  // Activation bounds computed against the output tensor's quantization.
+  int32_t act_min{};
+  int32_t act_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &act_min, &act_max);
+
+  tflite::ArithmeticParams op_params{};
+  op_params.input1_offset = -input1()->zero_point(); // Note the '-'.
+  op_params.input2_offset = -input2()->zero_point(); // Note the '-'.
+  op_params.output_offset = output()->zero_point();
+  op_params.output_multiplier = fixed_multiplier;
+  op_params.output_shift = fixed_shift;
+  op_params.quantized_activation_min = act_min;
+  op_params.quantized_activation_max = act_max;
+
+  const auto lhs_shape = getTensorShape(input1());
+  const auto rhs_shape = getTensorShape(input2());
+  const auto out_shape = getTensorShape(output());
+  const auto lhs_data = getTensorData<uint8_t>(input1());
+  const auto rhs_data = getTensorData<uint8_t>(input2());
+  const auto out_data = getTensorData<uint8_t>(output());
+
+  // ProcessBroadcastShapes gets op_params by pointer; its result picks the kernel to run.
+  if (tflite::reference_ops::ProcessBroadcastShapes(lhs_shape, rhs_shape, &op_params))
+    tflite::reference_ops::BroadcastDivSlow(op_params, lhs_shape, lhs_data, rhs_shape, rhs_data,
+                                            out_shape, out_data);
+  else
+    tflite::reference_ops::Div(op_params, lhs_shape, lhs_data, rhs_shape, rhs_data, out_shape,
+                               out_data);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Div.h b/compiler/luci-micro/luci-interpreter/src/kernels/Div.h
new file mode 100644
index 000000000..c1bf3e10b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Div.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_DIV_H
+#define LUCI_INTERPRETER_KERNELS_DIV_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Div : public KernelWithParams<DivParams>
+{
+public:
+  Div(const Tensor *input1, const Tensor *input2, Tensor *output, const DivParams &params);
+  // Positional accessors: input1 is the dividend, input2 the divisor, output the quotient.
+  const Tensor *input1() const { return _inputs[0]; }
+  const Tensor *input2() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() checks element types and sizes the output; execute() runs the division.
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;                         // FLOAT32 path
+  template <typename T> void evalInteger() const; // S32 / S64 path
+  void evalQuantized() const;                     // U8 path
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_DIV_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Div.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Div.test.cpp
new file mode 100644
index 000000000..85cd8b90a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Div.test.cpp
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Div.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class DivTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Fresh for every test (see SetUp); the tests pass it to tensor factories and allocate with it.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+// Comparison tolerance for 8-bit quantized results over [min, max]: 2 * step + step^2.
+float GetTolerance(float min, float max)
+{
+  const float step = (max - min) / 255.0f; // width of one of the 255 quantization intervals
+  return 2.0f * step + step * step;
+}
+
+TEST_F(DivTest, Float)
+{
+  // Same-shape FLOAT32 operands with RELU as the fused activation.
+  const Shape operand_shape{2, 3, 1, 1};
+  const std::vector<int32_t> expected_shape{2, 3, 1, 1};
+
+  const std::vector<float> dividend{0.3f, 2.3f, 0.9f, 0.5f, 0.8f, 1.1f};
+  const std::vector<float> divisor{0.2f, 1.6f, 0.5f, 0.4f, 1.6f, 0.4f};
+  const std::vector<float> expected{1.5f, 1.4375f, 1.8f, 1.25f, 0.5f, 2.75f};
+
+  IMemoryManager *mm = _memory_manager.get();
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(operand_shape, dividend, mm);
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(operand_shape, divisor, mm);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  DivParams params{};
+  params.activation = Activation::RELU;
+  Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+
+  // configure() fixes the output shape, so its memory can only be allocated afterwards.
+  kernel.configure();
+  mm->allocate_memory(output_tensor);
+  kernel.execute();
+
+  const auto produced = extractTensorData<float>(output_tensor);
+  EXPECT_THAT(produced, FloatArrayNear(expected, 0.0001f));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(expected_shape));
+}
+
+TEST_F(DivTest, FloatBroadcast)
+{
+  // A {1, 3} dividend against a {3, 1} divisor broadcasts to nine quotients; RELU then
+  // zeroes the ones that come from the negative dividend entry.
+  const Shape dividend_shape{1, 3};
+  const Shape divisor_shape{3, 1};
+  const std::vector<float> dividend{-0.3f, 2.3f, 0.9f};
+  const std::vector<float> divisor{0.2f, 1.6f, 0.5f};
+  const std::vector<float> expected{0.f, 11.5f, 4.5f, 0.f, 1.4375f, 0.5625f, 0.f, 4.6f, 1.8f};
+
+  IMemoryManager *mm = _memory_manager.get();
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(dividend_shape, dividend, mm);
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(divisor_shape, divisor, mm);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  DivParams params{};
+  params.activation = Activation::RELU;
+  Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+
+  kernel.configure();
+  mm->allocate_memory(output_tensor);
+  kernel.execute();
+
+  const auto produced = extractTensorData<float>(output_tensor);
+  EXPECT_THAT(produced, FloatArrayNear(expected, 0.0001f));
+}
+
+TEST_F(DivTest, Uint8)
+{
+  // All three tensors share the U8 quantization parameters derived from the (-1, 1) bounds.
+  const std::pair<float, int32_t> quant = quantizationParams<uint8_t>(-1.f, 1.f);
+  const float scale = quant.first;
+  const int32_t zero_point = quant.second;
+  const float tolerance = GetTolerance(-1.0, 1.0);
+
+  const Shape operand_shape{1, 2, 2, 1};
+  const std::vector<int32_t> expected_shape{1, 2, 2, 1};
+  const std::vector<float> dividend = {-0.8f, -0.2f, 0.3f, 0.7f};
+  const std::vector<float> divisor = {-0.8f, 0.4f, 0.8f, 1.0f};
+  const std::vector<float> expected{1.0f, 0.f, 0.375f, 0.7f};
+
+  IMemoryManager *mm = _memory_manager.get();
+  Tensor input1_tensor =
+    makeInputTensor<DataType::U8>(operand_shape, scale, zero_point, dividend, mm);
+  Tensor input2_tensor =
+    makeInputTensor<DataType::U8>(operand_shape, scale, zero_point, divisor, mm);
+  Tensor output_tensor = makeOutputTensor(getElementType<uint8_t>(), scale, zero_point);
+
+  DivParams params{};
+  params.activation = Activation::RELU;
+  Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+
+  // configure() fixes the output shape, so its memory can only be allocated afterwards.
+  kernel.configure();
+  mm->allocate_memory(output_tensor);
+  kernel.execute();
+
+  // Compare in the real domain, allowing for quantization error.
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(expected, tolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(expected_shape));
+}
+
+template <loco::DataType DType> void checkInteger(luci_interpreter::IMemoryManager *memory_manager)
+{
+  // One fixed dividend is divided by the same six divisor values laid out in four shapes.
+  using dtype = typename loco::DataTypeImpl<DType>::Type;
+  const Shape lhs_shape{2, 3, 1, 2};
+  const std::vector<Shape> rhs_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+
+  std::vector<std::vector<dtype>> test_outputs = {{5,  6,  2, 0,  10, 3, //
+                                                   10, 0,  4, 5,  20, 0, //
+                                                   0,  0,  0, 2,  0,  0, //
+                                                   2,  0,  1, 10, 5,  0, //
+                                                   2,  3,  1, 0,  5,  1, //
+                                                   18, 20, 7, 0,  37, 10},
+                                                  {5, 6, 4, 5, 0, 0, 2, 0, 1, 0, 37, 10},
+                                                  {5, 7, 4, 6, 2, 3, 10, 0,  8,  0,  4, 0,
+                                                   0, 0, 0, 0, 0, 0, 0,  10, 5,  0,  1, 0,
+                                                   0, 0, 5, 9, 1, 1, 0,  0,  37, 50, 7, 10},
+                                                  {5, 7, 8, 0, 0, 0, 0, 10, 5, 9, 7, 10}};
+  const std::vector<dtype> lhs_values{20, 30, 40, -17, -4, -7, 11, -31, 10, 19, 75, 100};
+  const std::vector<dtype> rhs_values{4, 5, 10, -3, 2, 10};
+  // RELU is the fused activation for every case, so the params are built once.
+  DivParams params{};
+  params.activation = Activation::RELU;
+  for (size_t i = 0; i < rhs_shapes.size(); ++i)
+  {
+    Tensor input1_tensor = makeInputTensor<DType>(lhs_shape, lhs_values, memory_manager);
+    Tensor input2_tensor = makeInputTensor<DType>(rhs_shapes[i], rhs_values, memory_manager);
+    Tensor output_tensor = makeOutputTensor(DType);
+
+    Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    memory_manager->allocate_memory(output_tensor);
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorData<dtype>(output_tensor), test_outputs[i])
+      << "With shape number " << i;
+  }
+}
+
+TEST_F(DivTest, SInt64)
+{
+  checkInteger<loco::DataType::S64>(_memory_manager.get()); // S64 run of the shared check
+  SUCCEED(); // all expectations are made inside checkInteger()
+}
+
+TEST_F(DivTest, SInt32)
+{
+  checkInteger<loco::DataType::S32>(_memory_manager.get()); // S32 run of the shared check
+  SUCCEED(); // all expectations are made inside checkInteger()
+}
+
+TEST_F(DivTest, Input_Output_Type_NEG)
+{
+  // Operands of different element types (FLOAT32 vs S32) must be rejected by configure().
+  IMemoryManager *mm = _memory_manager.get();
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, mm);
+  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {2}, mm);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  DivParams params{};
+  params.activation = Activation::RELU;
+  Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(DivTest, Invalid_Input_Type_NEG)
+{
+  // U64 passes configure() because all three types agree, but execute() has no U64 path.
+  IMemoryManager *mm = _memory_manager.get();
+  Tensor input1_tensor = makeInputTensor<DataType::U64>({1}, {1}, mm);
+  Tensor input2_tensor = makeInputTensor<DataType::U64>({1}, {2}, mm);
+  Tensor output_tensor = makeOutputTensor(DataType::U64);
+  DivParams params{};
+  params.activation = Activation::RELU;
+  Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+  kernel.configure();
+  mm->allocate_memory(output_tensor);
+  EXPECT_ANY_THROW(kernel.execute());
+}
+
+TEST_F(DivTest, Invalid_Output_Type_NEG)
+{
+  // S32 operands with an S64 output: configure() must reject the input/output type mismatch.
+  IMemoryManager *mm = _memory_manager.get();
+  Tensor input1_tensor = makeInputTensor<DataType::S32>({1}, {1}, mm);
+  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {2}, mm);
+  Tensor output_tensor = makeOutputTensor(DataType::S64);
+  DivParams params{};
+  params.activation = Activation::RELU;
+  Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Elu.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Elu.cpp
new file mode 100644
index 000000000..697d63be4
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Elu.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Elu.h"
+#include "kernels/Utils.h"
+
+#include "PALElu.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+// Hands the single input and single output tensor to the Kernel base; nothing else to set up.
+Elu::Elu(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {}
+
+void Elu::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type()); // same type
+  output()->resize(input()->shape()); // the output takes exactly the input's shape
+}
+
+void Elu::execute() const
+{
+  // FLOAT32 is the only element type with an implementation; everything else throws.
+  if (input()->element_type() != DataType::FLOAT32)
+    throw std::runtime_error("Unsupported type.");
+
+  const auto in_shape = getTensorShape(input());
+  const auto out_shape = getTensorShape(output());
+  const auto in_data = getTensorData<float>(input());
+  const auto out_data = getTensorData<float>(output());
+  luci_interpreter_pal::Elu(in_shape, in_data, out_shape, out_data);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Elu.h b/compiler/luci-micro/luci-interpreter/src/kernels/Elu.h
new file mode 100644
index 000000000..c844ab57f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Elu.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_ELU_H
+#define LUCI_INTERPRETER_KERNELS_ELU_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Elu : public Kernel
+{
+public:
+  Elu(const Tensor *input, Tensor *output);
+  // Positional accessors for the one input and the one output tensor.
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() checks types and sizes the output; execute() handles FLOAT32 data only.
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_ELU_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Elu.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Elu.test.cpp
new file mode 100644
index 000000000..814499cdb
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Elu.test.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Elu.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<float> input_data, std::initializer_list<float> output_data)
+{
+  // Runs Elu on a FLOAT32 tensor and checks both the produced values and the output shape.
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Elu kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor); // shape is only known after configure()
+  kernel.execute();
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+TEST(EluTest, SimpleElu)
+{
+  Check(
+    /*input_shape=*/{1, 2, 4, 1}, /*output_shape=*/{1, 2, 4, 1},
+    /*input_data=*/
+    {
+      0, -6, 2, -4,    // mix of zero, positive and negative inputs
+      3, -2, 10, -0.1, //
+    },
+    /*output_data=*/
+    {
+      0.0, -0.997521, 2.0, -0.981684,   // x >= 0 is unchanged; x < 0 becomes exp(x) - 1
+      3.0, -0.864665, 10.0, -0.0951626, //
+    });
+}
+
+TEST(EluTest, InOutTypeMismatch_NEG)
+{
+  // FLOAT32 input paired with a U8 output: configure() must refuse the type mismatch.
+  const std::unique_ptr<IMemoryManager> mm = std::make_unique<TestMemoryManager>();
+  const Shape shape{1, 2, 4, 1};
+  const std::vector<float> values{
+    0, -6, 2,  -4,   //
+    3, -2, 10, -0.1, //
+  };
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(shape, values, mm.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+  Elu kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Equal.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Equal.cpp
new file mode 100644
index 000000000..a57e127b7
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Equal.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Equal.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/comparisons.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+// Hands the two operands and the output tensor to the Kernel base; nothing else to set up.
+Equal::Equal(const Tensor *x, const Tensor *y, Tensor *output) : Kernel({x, y}, {output}) {}
+
+void Equal::configure()
+{
+  LUCI_INTERPRETER_CHECK(x()->element_type() == y()->element_type());
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::BOOL);
+  // For U8 operands, cache a multiplier/shift pair per input; evalQuantized() reads them.
+  if (x()->element_type() == DataType::U8)
+  {
+    quantizeMultiplierSmallerThanOneExp(x()->scale(), &_x_multiplier, &_x_shift);
+    quantizeMultiplierSmallerThanOneExp(y()->scale(), &_y_multiplier, &_y_shift);
+  }
+  output()->resize(calculateShapeForBroadcast(x()->shape(), y()->shape()));
+}
+
+// Compares x and y element-wise using the evaluator that matches x's element type.
+// FLOAT32, S64, S32 and U8 are handled; any other type raises std::runtime_error.
+void Equal::execute() const
+{
+  const auto type = x()->element_type();
+
+  if (type == DataType::FLOAT32)
+    return evalFloat();
+
+  if (type == DataType::S64)
+    return evalInteger<int64_t>();
+
+  if (type == DataType::S32)
+    return evalInteger<int32_t>();
+
+  if (type == DataType::U8)
+    return evalQuantized();
+
+  throw std::runtime_error("Unsupported type.");
+}
+
+void Equal::evalFloat() const
+{
+  // Element-wise float equality; one bool is written per output element.
+  const auto x_shape = getTensorShape(x());
+  const auto y_shape = getTensorShape(y());
+  const auto out_shape = getTensorShape(output());
+  const auto x_data = getTensorData<float>(x());
+  const auto y_data = getTensorData<float>(y());
+  const auto out_data = getTensorData<bool>(output());
+
+  // Value-initialised so the quantization fields this path never sets are zero, not garbage.
+  tflite::ComparisonParams op_params{};
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  // Operands of differing shape go through the 4-D broadcasting kernel.
+  if (op_params.is_broadcast)
+    tflite::reference_ops::Broadcast4DSlowEqual(op_params, x_shape, x_data, y_shape, y_data,
+                                                out_shape, out_data);
+  else
+    tflite::reference_ops::Equal(op_params, x_shape, x_data, y_shape, y_data, out_shape, out_data);
+}
+
+template <typename T> void Equal::evalInteger() const
+{
+  // Integer equality; execute() instantiates this for int32_t and int64_t.
+  const auto x_shape = getTensorShape(x());
+  const auto y_shape = getTensorShape(y());
+  const auto out_shape = getTensorShape(output());
+  const auto x_data = getTensorData<T>(x());
+  const auto y_data = getTensorData<T>(y());
+  const auto out_data = getTensorData<bool>(output());
+
+  // Value-initialised so the quantization fields this path never sets are zero, not garbage.
+  tflite::ComparisonParams op_params{};
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  // Operands of differing shape go through the 4-D broadcasting kernel.
+  if (op_params.is_broadcast)
+    tflite::reference_ops::Broadcast4DSlowEqualNoScaling(op_params, x_shape, x_data, y_shape,
+                                                         y_data, out_shape, out_data);
+  else
+    tflite::reference_ops::EqualNoScaling(op_params, x_shape, x_data, y_shape, y_data,
+                                          out_shape, out_data);
+}
+
+void Equal::evalQuantized() const
+{
+  // U8 comparison: each operand carries its own offset and the multiplier/shift from configure().
+  tflite::ComparisonParams op_params;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+  op_params.left_shift = 8;
+
+  op_params.input1_offset = -x()->zero_point(); // Note the '-'
+  op_params.input1_multiplier = _x_multiplier;
+  op_params.input1_shift = _x_shift;
+
+  op_params.input2_offset = -y()->zero_point(); // Note the '-'
+  op_params.input2_multiplier = _y_multiplier;
+  op_params.input2_shift = _y_shift;
+
+  const auto x_shape = getTensorShape(x());
+  const auto y_shape = getTensorShape(y());
+  const auto out_shape = getTensorShape(output());
+  const auto x_data = getTensorData<uint8_t>(x());
+  const auto y_data = getTensorData<uint8_t>(y());
+  const auto out_data = getTensorData<bool>(output());
+
+  if (op_params.is_broadcast)
+    tflite::reference_ops::Broadcast4DSlowEqualWithScaling(op_params, x_shape, x_data, y_shape,
+                                                           y_data, out_shape, out_data);
+  else
+    tflite::reference_ops::EqualWithScaling(op_params, x_shape, x_data, y_shape, y_data,
+                                            out_shape, out_data);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Equal.h b/compiler/luci-micro/luci-interpreter/src/kernels/Equal.h
new file mode 100644
index 000000000..c9be32cc0
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Equal.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_EQUAL_H
+#define LUCI_INTERPRETER_KERNELS_EQUAL_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Equal : public Kernel
+{
+public:
+  Equal(const Tensor *x, const Tensor *y, Tensor *output);
+  // Positional accessors: x and y are the operands being compared, output holds the result.
+  const Tensor *x() const { return _inputs[0]; }
+  const Tensor *y() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() checks types and sizes the output; execute() runs the comparison.
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;                         // FLOAT32 path
+  template <typename T> void evalInteger() const; // S32 / S64 path
+  void evalQuantized() const;                     // U8 path
+
+private:
+  int32_t _x_multiplier = 0; // the four values below are set by configure() for U8 inputs only
+  int _x_shift = 0;
+  int32_t _y_multiplier = 0;
+  int _y_shift = 0;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_EQUAL_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Equal.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Equal.test.cpp
new file mode 100644
index 000000000..5870e5460
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Equal.test.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Equal.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class EqualTest : public ::testing::Test // fixture: a fresh memory manager for every test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager; // passed to tensor creation and allocation
+};
+
+TEST_F(EqualTest, FloatSimple) // FLOAT32 inputs of identical shape {2, 3}
+{
+  std::vector<float> x_data{
+    0.5, 0.7, 0.9, // Row 1
+    1,   0,   -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+    0.9, 0.7, 0.5, // Row 1
+    -1,  0,   1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+    false, true, false, // Row 1
+    false, true, false, // Row 2
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor); // output storage is allocated after configure()
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
+}
+
+TEST_F(EqualTest, FloatBroadcast) // y {1, 3} is broadcast against every row of x {4, 3}
+{
+  std::vector<float> x_data{
+    0.5, 0.7, 0.9, // Row 1
+    1,   0,   -1,  // Row 2
+    -1,  0,   1,   // Row 3
+    0.9, 0.7, 0.5, // Row 4
+  };
+
+  std::vector<float> y_data{
+    0.9, 0.7, 0.5, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+    false, true,  false, // Row 1
+    false, false, false, // Row 2
+    false, false, false, // Row 3
+    true,  true,  true,  // Row 4
+  };
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({4, 3}, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor); // output storage is allocated after configure()
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({4, 3}));
+}
+
+template <loco::DataType DType> // DType selects the integer element type under test
+void checkIntegerSimple(luci_interpreter::IMemoryManager *memory_manager)
+{
+  using dtype = typename loco::DataTypeImpl<DType>::Type;
+  dtype min_value = std::numeric_limits<dtype>::min();
+  dtype max_value = std::numeric_limits<dtype>::max();
+  std::vector<dtype> x_data{min_value, 2, max_value}; // includes both extremes of dtype
+
+  std::vector<dtype> y_data{min_value, -2, max_value};
+
+  std::vector<bool> ref_output_data{true, false, true};
+
+  Tensor x_tensor = makeInputTensor<DType>({3}, x_data, memory_manager);
+  Tensor y_tensor = makeInputTensor<DType>({3}, y_data, memory_manager);
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3}));
+}
+
+template <loco::DataType DType> // DType selects the integer element type under test
+void checkIntegerBroadcast(luci_interpreter::IMemoryManager *memory_manager)
+{
+  using dtype = typename loco::DataTypeImpl<DType>::Type;
+  dtype min_value = std::numeric_limits<dtype>::min();
+  dtype max_value = std::numeric_limits<dtype>::max();
+  std::vector<dtype> x_data{
+    min_value, 2,  3,         // Row 1
+    4,         5,  max_value, // Row 2
+    -1,        -2, -3,        // Row 3
+    min_value, -2, max_value, // Row 4
+  };
+
+  std::vector<dtype> y_data{
+    min_value, -2, max_value, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+    true,  false, false, // Row 1
+    false, false, true,  // Row 2
+    false, true,  false, // Row 3
+    true,  true,  true,  // Row 4
+  };
+
+  Tensor x_tensor = makeInputTensor<DType>({4, 3}, x_data, memory_manager);
+  Tensor y_tensor = makeInputTensor<DType>({3}, y_data, memory_manager); // rank-1, broadcast
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({4, 3}));
+}
+
+TEST_F(EqualTest, Int32) // runs both integer helpers with S32 tensors
+{
+  checkIntegerSimple<loco::DataType::S32>(_memory_manager.get());
+  checkIntegerBroadcast<loco::DataType::S32>(_memory_manager.get());
+  SUCCEED(); // the actual expectations are inside the helper templates
+}
+
+TEST_F(EqualTest, Int64) // runs both integer helpers with S64 tensors
+{
+  checkIntegerSimple<loco::DataType::S64>(_memory_manager.get());
+  checkIntegerBroadcast<loco::DataType::S64>(_memory_manager.get());
+  SUCCEED(); // the actual expectations are inside the helper templates
+}
+
+// Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+const float F_MIN = -128.0 / 128.0;
+const float F_MAX = 127.0 / 128.0;
+
+TEST_F(EqualTest, Uint8Quantized) // x and y are quantized with different ranges
+{
+  std::vector<float> x_data{
+    0.5, 0.5, 0.7,  0.9, // Row 1
+    1,   0,   0.05, -1,  // Row 2
+  };
+
+  std::vector<float> y_data{
+    0.9, 0.5, 0.55, 0.5, // Row 1
+    -1,  0,   0.05, 1,   // Row 2
+  };
+
+  std::vector<bool> ref_output_data{
+    false, true, false, false, // Row 1
+    false, true, true,  false, // Row 2
+  };
+
+  std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data, _memory_manager.get());
+
+  std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 2, F_MAX * 2);
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data, _memory_manager.get());
+
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST_F(EqualTest, Uint8QuantizedBroadcast) // both inputs share one set of quantization params
+{
+  std::vector<float> x_data{
+    0.4,  -0.8, 0.7,  0.3, // Row 1
+    -0.5, 0.1,  0,    0.5, // Row 2
+    1,    0,    0.05, -1,  // Row 3
+    -1,   0.05, 0,    1,   // Row 4
+  };
+
+  std::vector<float> y_data{
+    -1, 0.05, 0, 1, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+    false, false, false, false, // Row 1
+    false, false, true,  false, // Row 2
+    false, false, false, false, // Row 3
+    true,  true,  true,  true,  // Row 4
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 4, 4, 1}, quant_param.first, quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 1, 4, 1}, quant_param.first, quant_param.second, y_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL); // y {1,1,4,1} broadcasts to {1,4,4,1}
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST_F(EqualTest, Input_Type_Mismatch_NEG) // FLOAT32 x versus U8 y must be rejected
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(EqualTest, Input_Output_Type_NEG) // a FLOAT32 (non-BOOL) output must be rejected
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(EqualTest, Float_Broadcast_NEG) // shapes {2} and {3} cannot be broadcast together
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2}, {1.f, 2.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({3}, {1.f, 2.f, 3.f}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  ASSERT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(EqualTest, Int32_Broadcast_NEG) // shapes {2} and {3} cannot be broadcast together
+{
+  Tensor x_tensor = makeInputTensor<DataType::S32>({2}, {1, 2}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::S32>({3}, {1, 2, 3}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  ASSERT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(EqualTest, Int64_Broadcast_NEG) // shapes {2} and {3} cannot be broadcast together
+{
+  Tensor x_tensor = makeInputTensor<DataType::S64>({2}, {1, 2}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::S64>({3}, {1, 2, 3}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  Equal kernel(&x_tensor, &y_tensor, &output_tensor);
+  ASSERT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Exp.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Exp.cpp
new file mode 100644
index 000000000..e7c560a88
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Exp.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Exp.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/exp.h>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Exp::Exp(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {} // 1 in, 1 out
+
+void Exp::configure() // requires matching input/output element types
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  output()->resize(input()->shape()); // the output takes exactly the input's shape
+}
+
+void Exp::execute() const // dispatches on the input element type
+{
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32: // the only element type handled by this kernel
+      evalFloat();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Exp::evalFloat() const // FLOAT32 path: delegates to the TFLite reference Exp implementation
+{
+  const int size = tflite::MatchingFlatSize(getTensorShape(input()), getTensorShape(output()));
+  tflite::reference_ops::Exp(getTensorData<float>(input()), size, getTensorData<float>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Exp.h b/compiler/luci-micro/luci-interpreter/src/kernels/Exp.h
new file mode 100644
index 000000000..429177375
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Exp.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_EXP_H
+#define LUCI_INTERPRETER_KERNELS_EXP_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Exp : public Kernel // exponential kernel with one input and one output tensor
+{
+public:
+  Exp(const Tensor *input, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const; // FLOAT32 implementation invoked from execute()
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_EXP_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Exp.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Exp.test.cpp
new file mode 100644
index 000000000..a159d9db9
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Exp.test.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Exp.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(ExpTest, Float) // note: exp(100.0f) exceeds the float range, so one reference value is +inf
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Shape input_shape{1, 1, 7};
+  std::vector<float> input_data{0.0f, 1.0f, -1.0f, 100.0f, -100.0f, 0.01f, -0.01f};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Exp kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<int32_t> ref_output_shape{1, 1, 7}; // same shape as the input
+  std::vector<float> ref_output_data{std::exp(0.0f),   std::exp(1.0f),    std::exp(-1.0f),
+                                     std::exp(100.0f), std::exp(-100.0f), std::exp(0.01f),
+                                     std::exp(-0.01f)};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ExpandDims.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/ExpandDims.cpp
new file mode 100644
index 000000000..ba35c99fa
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ExpandDims.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ExpandDims.h"
+#include "kernels/Utils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+ExpandDims::ExpandDims(const Tensor *input, const Tensor *axis, Tensor *output)
+  : Kernel({input, axis}, {output}) // inputs: [0] data tensor, [1] axis tensor
+{
+}
+
+void ExpandDims::configure() // output shape = input shape with a 1 inserted at `axis`
+{
+  // Keep the axis 64-bit until it is range-checked so a large S64 value cannot wrap into range.
+  int64_t axis_value;
+  switch (axis()->element_type())
+  {
+    case loco::DataType::S32:
+      axis_value = *getTensorData<int32_t>(axis());
+      break;
+    case loco::DataType::S64:
+      axis_value = *getTensorData<int64_t>(axis());
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+
+  const auto input_shape = input()->shape();
+
+  if (axis_value < 0)
+  {
+    axis_value += input_shape.num_dims() + 1; // negative axes count from the end of the output
+  }
+
+  LUCI_INTERPRETER_CHECK(axis_value <= input_shape.num_dims() and axis_value >= 0);
+
+  Shape output_shape(input_shape.num_dims() + 1);
+  for (int32_t i = 0; i < output_shape.num_dims(); ++i)
+  {
+    if (i < axis_value)
+    {
+      output_shape.dim(i) = input_shape.dim(i);
+    }
+    else if (i == axis_value)
+    {
+      output_shape.dim(i) = 1;
+    }
+    else
+    {
+      LUCI_INTERPRETER_CHECK(i >= 1);
+      output_shape.dim(i) = input_shape.dim(i - 1); // dims after the new axis shift by one
+    }
+  }
+
+  output()->resize(output_shape);
+}
+
+void ExpandDims::execute() const
+{
+  // Just copy input to output: the element count is taken from the input tensor
+  const auto *input_data = input()->data<void>();
+  auto *output_data = output()->data<void>();
+
+  const size_t element_size = getDataTypeSize(input()->element_type());
+  const int32_t num_elements = input()->shape().num_elements();
+  std::memcpy(output_data, input_data, num_elements * element_size); // size in bytes
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ExpandDims.h b/compiler/luci-micro/luci-interpreter/src/kernels/ExpandDims.h
new file mode 100644
index 000000000..e510b1160
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ExpandDims.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_EXPAND_DIMS_H
+#define LUCI_INTERPRETER_KERNELS_EXPAND_DIMS_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class ExpandDims : public Kernel // inserts a dimension of size 1 into the input's shape
+{
+public:
+  ExpandDims(const Tensor *input, const Tensor *axis, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; } // data tensor
+  const Tensor *axis() const { return _inputs[1]; }  // position of the inserted dimension
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_EXPAND_DIMS_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ExpandDims.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/ExpandDims.test.cpp
new file mode 100644
index 000000000..df9eaccc0
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ExpandDims.test.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ExpandDims.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class ExpandDimsTest : public ::testing::Test // fixture: a fresh memory manager for every test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager; // passed to tensor creation and allocation
+};
+
+TEST_F(ExpandDimsTest, PositiveAxis) // axis 0 on a {2, 2} input gives shape {1, 2, 2}
+{
+  std::vector<int32_t> input_data{-1, 1, -2, 2};
+  std::initializer_list<int32_t> input_shape = {2, 2};
+
+  std::initializer_list<int32_t> axis_value = {0};
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::S32>(input_shape, input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_value, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S32);
+
+  ExpandDims kernel(&input_tensor, &axis_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<int32_t>(output_tensor), ::testing::ElementsAreArray(input_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2}));
+}
+
+TEST_F(ExpandDimsTest, NegAxis) // axis -1 on a {2, 2} input gives shape {2, 2, 1}
+{
+  std::vector<int32_t> input_data{-1, 1, -2, 2};
+  std::initializer_list<int32_t> input_shape = {2, 2};
+
+  std::initializer_list<int32_t> axis_value = {-1};
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::S32>(input_shape, input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_value, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S32);
+
+  ExpandDims kernel(&input_tensor, &axis_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<int32_t>(output_tensor), ::testing::ElementsAreArray(input_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 2, 1}));
+}
+
+TEST_F(ExpandDimsTest, InvalidAxisType_NEG) // a FLOAT32 axis tensor must be rejected
+{
+  std::vector<int32_t> input_data{-1, 1, -2, 2};
+  std::initializer_list<int32_t> input_shape = {2, 2};
+
+  std::initializer_list<float> axis_value = {1.0};
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::S32>(input_shape, input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::FLOAT32>({1}, axis_value, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S32);
+
+  ExpandDims kernel(&input_tensor, &axis_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(ExpandDimsTest, InvalidAxisValue_NEG) // axis 3 is beyond the rank-2 input's valid range
+{
+  std::vector<int32_t> input_data{-1, 1, -2, 2};
+  std::initializer_list<int32_t> input_shape = {2, 2};
+
+  std::initializer_list<int32_t> axis_value = {3};
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::S32>(input_shape, input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_value, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S32);
+
+  ExpandDims kernel(&input_tensor, &axis_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Fill.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Fill.cpp
new file mode 100644
index 000000000..e09d6331a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Fill.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Fill.h"
+#include "kernels/Utils.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Fill::Fill(const Tensor *dims, const Tensor *value, Tensor *output)
+  : Kernel({dims, value}, {output}) // inputs: [0] output dimensions, [1] fill value
+{
+}
+
+template <typename T> void Fill::configureShape() // T: element type used to read `dims`
+{
+  const auto dims_data = getTensorData<T>(dims());
+  Shape output_shape(dims()->shape().dim(0)); // output rank = length of the `dims` tensor
+
+  for (int i = 0; i < output_shape.num_dims(); ++i)
+  {
+    T data = dims_data[i];
+    if (data < 0)
+      throw std::runtime_error("Fill dimensions must be >= 0");
+
+    output_shape.dim(i) = data; // NOTE(review): T = int64_t may be narrowed here; no upper bound
+  }
+
+  output()->resize(output_shape);
+}
+
+void Fill::configure() // validates the inputs, then sizes the output from the data of `dims`
+{
+  const auto dims_shape = dims()->shape();
+  const auto value_shape = value()->shape();
+
+  // Make sure the 1st input tensor is 1-D
+  LUCI_INTERPRETER_CHECK(dims_shape.num_dims() == 1);
+
+  // Make sure the 1st input tensor is int32 or int64
+  LUCI_INTERPRETER_CHECK(dims()->element_type() == DataType::S32 or
+                         dims()->element_type() == DataType::S64);
+
+  // Make sure the 2nd input tensor is a scalar of the output's type (execute() reads it as such)
+  LUCI_INTERPRETER_CHECK(value_shape.num_dims() == 0);
+  LUCI_INTERPRETER_CHECK(value()->element_type() == output()->element_type());
+
+  // Check zero point and scale for S16 and S8
+  if (value()->element_type() == loco::DataType::S16 or
+      value()->element_type() == loco::DataType::S8)
+  {
+    LUCI_INTERPRETER_CHECK(value()->scale() == output()->scale());
+    LUCI_INTERPRETER_CHECK(value()->zero_point() == output()->zero_point());
+    if (value()->element_type() == loco::DataType::S16)
+      LUCI_INTERPRETER_CHECK(value()->zero_point() == 0);
+  }
+  // Resize output
+  switch (dims()->element_type())
+  {
+    case DataType::S32:
+      configureShape<int32_t>();
+      break;
+    case DataType::S64:
+      configureShape<int64_t>();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Fill::execute() const // dispatches on the output element type to the TFLite reference Fill
+{
+  switch (output()->element_type())
+  {
+    case DataType::S8:
+      tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<int8_t>(value()),
+                                  getTensorShape(output()), getTensorData<int8_t>(output()));
+      break;
+    case DataType::S16:
+      tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<int16_t>(value()),
+                                  getTensorShape(output()), getTensorData<int16_t>(output()));
+      break;
+    case DataType::S32:
+      tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<int32_t>(value()),
+                                  getTensorShape(output()), getTensorData<int32_t>(output()));
+      break;
+    case DataType::S64:
+      tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<int64_t>(value()),
+                                  getTensorShape(output()), getTensorData<int64_t>(output()));
+      break;
+    case DataType::FLOAT32:
+      tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<float>(value()),
+                                  getTensorShape(output()), getTensorData<float>(output()));
+      break;
+    default: // any other output element type is rejected
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Fill.h b/compiler/luci-micro/luci-interpreter/src/kernels/Fill.h
new file mode 100644
index 000000000..184f0cb83
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Fill.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_FILL_H
+#define LUCI_INTERPRETER_KERNELS_FILL_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Fill : public Kernel // produces a tensor of shape `dims` filled with the scalar `value`
+{
+public:
+  Fill(const Tensor *dims, const Tensor *value, Tensor *output);
+
+  const Tensor *dims() const { return _inputs[0]; }  // requested output dimensions
+  const Tensor *value() const { return _inputs[1]; } // fill value
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  template <typename T> void configureShape(); // T: element type of the `dims` tensor
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_FILL_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Fill.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Fill.test.cpp
new file mode 100644
index 000000000..cf56df507
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Fill.test.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Fill.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class FillTest : public ::testing::Test // fixture shared by the Fill kernel tests
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Handed to the tensor-construction helpers and used to allocate output tensors.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+template <typename T, DataType DT> void runFillIntKernel(IMemoryManager *memory_manager)
+{
+  // Request a 2x3 result through a rank-1 S32 `dims` tensor; the fill value is a scalar 5.
+  const Shape shape_1d{2};
+  const std::vector<int32_t> dims_vals{2, 3};
+  const std::vector<T> fill_vals{5};
+
+  Tensor dims_tensor = makeInputTensor<loco::DataType::S32>(shape_1d, dims_vals, memory_manager);
+  Tensor value_tensor = makeInputTensor<DT>(/*scalar*/ {}, fill_vals, memory_manager);
+  Tensor output_tensor = makeOutputTensor(DT);
+
+  // The output buffer is allocated between configure() and execute().
+  Fill kernel(&dims_tensor, &value_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  // Expect the requested shape and six copies of the fill value.
+  const std::vector<int32_t> expected_shape{2, 3};
+  const std::vector<T> expected_data{5, 5, 5, 5, 5, 5};
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(expected_shape));
+  EXPECT_THAT(extractTensorData<T>(output_tensor), expected_data);
+}
+
+template <DataType DT> void runFillQuantIntKernel(IMemoryManager *memory_manager)
+{
+  // S8 runs with a zero point of 1; every other instantiation uses 0.
+  const int32_t zero_point = (DT == loco::DataType::S8) ? 1 : 0;
+
+  // Request a 2x3 result. The fill value is given as the real number 5, together with the
+  // quantization parameters (scale 0.25 and the zero point chosen above).
+  const Shape shape_1d{2};
+  const std::vector<int32_t> dims_vals{2, 3};
+  const std::vector<float> fill_vals{5};
+
+  Tensor dims_tensor = makeInputTensor<loco::DataType::S32>(shape_1d, dims_vals, memory_manager);
+  Tensor value_tensor = makeInputTensor<DT>(/*scalar*/ {}, /*scale*/ 0.25,
+                                            /*zero_point*/ zero_point, fill_vals, memory_manager);
+
+  Tensor output_tensor = makeOutputTensor(DT, /*scale*/ 0.25, /*zero_point*/ zero_point);
+
+  Fill kernel(&dims_tensor, &value_tensor, &output_tensor);
+
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  // The data is compared after dequantization, so one float reference serves both types.
+  const std::vector<float> expected_data{5, 5, 5, 5, 5, 5};
+  const std::vector<int32_t> expected_shape{2, 3};
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(expected_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(expected_data));
+}
+
+TEST_F(FillTest, FillInt)
+{
+  IMemoryManager *const mm = _memory_manager.get();
+
+  // Plain integer element types: int32_t first, then int64_t.
+  runFillIntKernel<int32_t, loco::DataType::S32>(mm);
+  runFillIntKernel<int64_t, loco::DataType::S64>(mm);
+
+  // Quantized element types: int8_t first, then int16_t.
+  runFillQuantIntKernel<loco::DataType::S8>(mm);
+  runFillQuantIntKernel<loco::DataType::S16>(mm);
+  SUCCEED();
+}
+
+TEST_F(FillTest, FillFloat)
+{
+  IMemoryManager *const mm = _memory_manager.get();
+
+  // Here `dims` is an S64 tensor asking for a 2x2x2 output; the fill value is a float scalar.
+  const Shape shape_1d{3};
+  const std::vector<int64_t> dims_vals{2, 2, 2};
+  const std::vector<float> fill_vals{5};
+
+  Tensor dims_tensor = makeInputTensor<loco::DataType::S64>(shape_1d, dims_vals, mm);
+  Tensor value_tensor = makeInputTensor<loco::DataType::FLOAT32>(/*scalar*/ {}, fill_vals, mm);
+  Tensor output_tensor = makeOutputTensor(loco::DataType::FLOAT32);
+
+  Fill kernel(&dims_tensor, &value_tensor, &output_tensor);
+  kernel.configure();
+  mm->allocate_memory(output_tensor);
+  kernel.execute();
+
+  // 2 * 2 * 2 = 8 elements, each equal to the fill value.
+  const std::vector<float> expected_data(8, 5.0f);
+  const std::vector<int32_t> expected_shape{2, 2, 2};
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor), expected_data);
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(expected_shape));
+}
+
+TEST_F(FillTest, Invalid_Input_Shape_NEG)
+{
+  IMemoryManager *const mm = _memory_manager.get();
+
+  // `dims` gets a rank-2 shape {1, 3}; the passing tests all use a rank-1 `dims`.
+  const Shape shape_2d{1, 3};
+  const std::vector<int32_t> dims_vals{2, 2, 2};
+  const std::vector<float> fill_vals{5};
+
+  Tensor dims_tensor = makeInputTensor<loco::DataType::S32>(shape_2d, dims_vals, mm);
+  Tensor value_tensor = makeInputTensor<loco::DataType::FLOAT32>(/*scalar*/ {}, fill_vals, mm);
+  Tensor output_tensor = makeOutputTensor(loco::DataType::FLOAT32);
+
+  Fill kernel(&dims_tensor, &value_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(FillTest, Invalid_Value_Shape_NEG)
+{
+  IMemoryManager *const mm = _memory_manager.get();
+  const Shape shape_1d{3};
+  const std::vector<int32_t> dims_vals{2, 2, 2};
+  const std::vector<float> fill_vals{5};
+
+  // `value` is built with shape {1} instead of as a scalar ({}); configure() should reject it.
+  Tensor dims_tensor = makeInputTensor<loco::DataType::S32>(shape_1d, dims_vals, mm);
+  Tensor value_tensor = makeInputTensor<loco::DataType::FLOAT32>({1}, fill_vals, mm);
+  Tensor output_tensor = makeOutputTensor(loco::DataType::FLOAT32);
+
+  Fill kernel(&dims_tensor, &value_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Floor.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Floor.cpp
new file mode 100644
index 000000000..e3c4246cc
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Floor.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Floor.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/floor.h>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Floor::Floor(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {} // 1 in, 1 out
+
+void Floor::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type()); // same dtype
+  output()->resize(input()->shape()); // the output takes the input's shape
+}
+
+void Floor::execute() const
+{
+  // FLOAT32 is the only element type this kernel evaluates.
+  const bool is_float = (input()->element_type() == DataType::FLOAT32);
+
+  // Any other element type is reported as an error.
+  if (!is_float)
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+  evalFloat();
+}
+
+void Floor::evalFloat() const // FLOAT32 path: delegates to tflite::reference_ops::Floor
+{
+  tflite::reference_ops::Floor(getTensorShape(input()), getTensorData<float>(input()),
+                               getTensorShape(output()), getTensorData<float>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Floor.h b/compiler/luci-micro/luci-interpreter/src/kernels/Floor.h
new file mode 100644
index 000000000..ca3ad5997
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Floor.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_FLOOR_H
+#define LUCI_INTERPRETER_KERNELS_FLOOR_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Floor : public Kernel // Floor op; execute() handles FLOAT32 only (see Floor.cpp)
+{
+public:
+  Floor(const Tensor *input, Tensor *output);
+  // Positional accessors: the single input and the single output.
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+  // Kernel interface overrides.
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const; // FLOAT32 implementation called from execute()
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_FLOOR_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Floor.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Floor.test.cpp
new file mode 100644
index 000000000..30076fb54
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Floor.test.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Floor.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class FloorTest : public ::testing::Test // fixture shared by the Floor kernel tests
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Handed to the tensor-construction helpers and used to allocate output tensors.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(FloorTest, SimpleFloat)
+{
+  const Shape io_shape{1, 2, 4, 1};
+  const std::vector<int32_t> expected_shape{1, 2, 4, 1}; // same shape as the input
+  const std::vector<float> input_values{
+    0.2, 8.6, 2.4,  4.3,  // Row 1
+    3,   7.1, 10.5, -0.9, // Row 2
+  };
+  // Each expected value is the floor of the matching input; note that -0.9 floors to -1.
+  const std::vector<float> expected_values{
+    0, 8, 2,  4,  // Row 1
+    3, 7, 10, -1, // Row 2
+  };
+
+  IMemoryManager *const mm = _memory_manager.get();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(io_shape, input_values, mm);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Floor kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  mm->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(expected_shape));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(expected_values));
+}
+
+TEST_F(FloorTest, Input_Output_Type_NEG)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S32);
+  // FLOAT32 input with an S32 output: configure() is expected to throw on the type mismatch.
+  Floor kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/FloorDiv.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/FloorDiv.cpp
new file mode 100644
index 000000000..a7a10a336
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/FloorDiv.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/FloorDiv.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/binary_function.h>
+#include <cmath>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+// Parameter names follow the declaration and the x()/y() accessors in FloorDiv.h.
+FloorDiv::FloorDiv(const Tensor *x, const Tensor *y, Tensor *output) : Kernel({x, y}, {output})
+{
+}
+
+void FloorDiv::configure()
+{
+  LUCI_INTERPRETER_CHECK(x()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(y()->element_type() == output()->element_type());
+  // With both operand types checked against the output, size the output for broadcasting.
+  output()->resize(calculateShapeForBroadcast(x()->shape(), y()->shape()));
+}
+
+void FloorDiv::execute() const
+{
+  // The element type of x selects the implementation, and FLOAT32 is the only one provided.
+  const bool is_float = (x()->element_type() == DataType::FLOAT32);
+
+  if (!is_float)
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+  evalFloat();
+}
+
+void FloorDiv::evalFloat() const
+{
+  // Per-element operation: the quotient is formed in double precision, then floored.
+  auto FloorDivFunc = [](float x, float y) -> float {
+    return std::floor(static_cast<double>(x) / static_cast<double>(y));
+  };
+  const auto x_data = getTensorData<float>(x());
+  const auto y_data = getTensorData<float>(y());
+
+  // Check the denominator. The element count is read once instead of on every iteration.
+  for (int i = 0, count = getTensorShape(y()).FlatSize(); i < count; ++i)
+  {
+    LUCI_INTERPRETER_CHECK(y_data[i] != 0);
+  }
+
+  if (x()->shape() != y()->shape())
+  {
+    tflite::reference_ops::BroadcastBinaryFunction4DSlow<float, float, float>(
+      getTensorShape(x()), x_data, getTensorShape(y()), y_data, getTensorShape(output()),
+      getTensorData<float>(output()), FloorDivFunc);
+  }
+  else
+  {
+    tflite::reference_ops::BinaryFunction<float, float, float>(
+      getTensorShape(x()), x_data, getTensorShape(y()), y_data, getTensorShape(output()),
+      getTensorData<float>(output()), FloorDivFunc);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/FloorDiv.h b/compiler/luci-micro/luci-interpreter/src/kernels/FloorDiv.h
new file mode 100644
index 000000000..e9c47d81a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/FloorDiv.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_FLOOR_DIV_H
+#define LUCI_INTERPRETER_KERNELS_FLOOR_DIV_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class FloorDiv : public Kernel // FloorDiv op: floor(x / y) per element, FLOAT32 only
+{
+public:
+  FloorDiv(const Tensor *x, const Tensor *y, Tensor *output);
+  // Positional accessors: input 0 is the dividend x, input 1 is the divisor y.
+  const Tensor *x() const { return _inputs[0]; }
+  const Tensor *y() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // Kernel interface overrides.
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const; // FLOAT32 implementation called from execute()
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_FLOOR_DIV_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/FloorDiv.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/FloorDiv.test.cpp
new file mode 100644
index 000000000..3e1b5f18e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/FloorDiv.test.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/FloorDiv.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class FloorDivTest : public ::testing::Test // fixture shared by the FloorDiv kernel tests
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Handed to the tensor-construction helpers and used to allocate output tensors.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(FloorDivTest, FloatSimple)
+{
+  // x and y have the same 2x3 shape, so no broadcasting is involved.
+  const Shape operand_shape{2, 3};
+  const std::vector<float> dividends{
+    0.5, 2.4,  3.1,  // Row 1
+    1.9, -1.9, -2.8, // Row 2
+  };
+  const std::vector<float> divisors{
+    2.0, 0.5,  3.0,  // Row 1
+    1.0, -1.0, -2.0, // Row 2
+  };
+
+  // expected[i] = floor(dividends[i] / divisors[i])
+  const std::vector<int32_t> expected_shape{2, 3};
+  const std::vector<float> expected{
+    0, 4, 1, // Row 1
+    1, 1, 1, // Row 2
+  };
+
+  IMemoryManager *const mm = _memory_manager.get();
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>(operand_shape, dividends, mm);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>(operand_shape, divisors, mm);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  mm->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(expected_shape));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(FloorDivTest, FloatBroadcast)
+{
+  // The single row of x (shape 1x3) is broadcast against the three rows of y (shape 3x3).
+  const Shape x_dims{1, 3};
+  const std::vector<float> dividends{
+    0.5, 2.4, -3.1, // Row 1
+  };
+  const Shape y_dims{3, 3};
+  const std::vector<float> divisors{
+    1.0, 1.0,  1.0,  // Row 1
+    2.0, -0.5, -2.0, // Row 2
+    0.3, 0.7,  0.9,  // Row 3
+  };
+
+  const std::vector<int32_t> expected_shape{3, 3};
+  const std::vector<float> expected{
+    0, 2,  -4, // Row 1
+    0, -5, 1,  // Row 2
+    1, 3,  -4, // Row 3
+  };
+
+  IMemoryManager *const mm = _memory_manager.get();
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>(x_dims, dividends, mm);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>(y_dims, divisors, mm);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  mm->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(expected_shape));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(FloorDivTest, DivByZero_NEG)
+{
+  IMemoryManager *const mm = _memory_manager.get();
+  const Shape operand_shape{3};
+  const std::vector<float> dividends{1, 0, -1};
+  const std::vector<float> divisors(3, 0.0f); // every denominator is zero
+
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>(operand_shape, dividends, mm);
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>(operand_shape, divisors, mm);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  // configure() is expected to succeed; the zero check should fire during execute().
+  FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  mm->allocate_memory(output_tensor);
+  EXPECT_ANY_THROW(kernel.execute());
+}
+
+TEST_F(FloorDivTest, Input_Output_Type_Mismatch_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+  // Both inputs are FLOAT32 but the output is U8, so configure() is expected to throw.
+  FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(FloorDivTest, Input_Type_Mismatch_NEG)
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  // x is FLOAT32 while y is U8 (output FLOAT32), so configure() is expected to throw.
+  FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/FullyConnected.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/FullyConnected.cpp
new file mode 100644
index 000000000..bd2bb2f35
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/FullyConnected.cpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/FullyConnected.h"
+
+#include "kernels/Utils.h"
+
+#include "PALFullyConnected.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+FullyConnected::FullyConnected(const Tensor *input, const Tensor *weights, const Tensor *bias,
+                               Tensor *output, const FullyConnectedParams &params)
+  : KernelWithParams<FullyConnectedParams>({input, weights, bias}, {output}, params)
+{ // `bias` may be null: configure() guards each of its uses with a null check.
+}
+
+void FullyConnected::configure()
+{
+  // Validate element types (keyed on the weights type), then shapes, then size the output.
+  if (weights()->element_type() == DataType::U8)
+  {
+    LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::U8);
+    LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::U8);
+    LUCI_INTERPRETER_CHECK(!bias() || bias()->element_type() == DataType::S32);
+  }
+  else if (weights()->element_type() == DataType::FLOAT32)
+  {
+    LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::FLOAT32);
+    LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32);
+    LUCI_INTERPRETER_CHECK(!bias() || bias()->element_type() == DataType::FLOAT32);
+  }
+  else if (weights()->element_type() == DataType::S8)
+  {
+    LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::S8);
+    LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::S8);
+    LUCI_INTERPRETER_CHECK(!bias() || bias()->element_type() == DataType::S32);
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+
+  const Shape &input_shape = input()->shape();
+  const Shape &weights_shape = weights()->shape();
+
+  LUCI_INTERPRETER_CHECK(weights_shape.num_dims() == 2);
+  LUCI_INTERPRETER_CHECK(bias() == nullptr ||
+                         bias()->shape().num_elements() == weights_shape.dim(0));
+
+  // weights_shape.dim(1) is the divisor of the '%' and '/' below, so reject zero up front.
+  LUCI_INTERPRETER_CHECK(weights_shape.dim(1) != 0);
+  LUCI_INTERPRETER_CHECK(input_shape.num_elements() % weights_shape.dim(1) == 0);
+  const int32_t batch_size = input_shape.num_elements() / weights_shape.dim(1);
+  const int32_t num_units = weights_shape.dim(0);
+
+  if (params().keep_num_dims == false)
+  {
+    output()->resize({batch_size, num_units}); // 2-D result: {batch, units}
+  }
+  else
+  {
+    luci_interpreter::Shape output_shape(input_shape.num_dims());
+    for (int i = 0; i < input_shape.num_dims(); ++i)
+      output_shape.dim(i) = input_shape.dim(i);
+    output_shape.dim(input_shape.num_dims() - 1) = num_units; // only the last dim changes
+    output()->resize(output_shape);
+  }
+}
+
+void FullyConnected::execute() const
+{
+  // The input element type selects the evaluation routine; configure() checks that the
+  // weights and output types agree with it.
+  const auto input_type = input()->element_type();
+
+  if (input_type == DataType::U8)
+    evalQuantized();
+  else if (input_type == DataType::S8)
+    evalQuantizedS8();
+  else if (input_type == DataType::FLOAT32)
+    evalFloat();
+  else
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void FullyConnected::evalFloat() const
+{
+  tflite::FullyConnectedParams op_params{};
+  op_params.weights_format = tflite::FullyConnectedWeightsFormat::kDefault;
+
+  // The clamp range implied by the fused activation is written straight into the parameters.
+  calculateActivationRange(_params.activation, &op_params.float_activation_min,
+                           &op_params.float_activation_max);
+
+  // getTensorShape()/getTensorData() are also applied to bias(), which configure() allows to be
+  // null; NOTE(review): presumably both helpers accept a null tensor -- confirm in Utils.h.
+  tflite::reference_ops::FullyConnected(
+    op_params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(weights()),
+    getTensorData<float>(weights()), getTensorShape(bias()), getTensorData<float>(bias()),
+    getTensorShape(output()), getTensorData<float>(output()));
+}
+
+void FullyConnected::evalQuantized() const
+{
+  // Combine the input, weights and output scales into an integer multiplier and a shift.
+  const double real_multiplier =
+    getQuantizedConvolutionMultipler(input()->scale(), weights()->scale(), output()->scale());
+  int32_t output_multiplier;
+  int output_shift;
+  quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
+
+  // Clamp range of the quantized output for the fused activation.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  calculateActivationRangeQuantized(params().activation, output(), &output_activation_min,
+                                    &output_activation_max);
+
+  // The input and weights zero points enter negated; the output zero point is used as is.
+  tflite::FullyConnectedParams op_params{};
+  op_params.input_offset = -input()->zero_point();
+  op_params.weights_offset = -weights()->zero_point();
+  op_params.output_offset = output()->zero_point();
+  op_params.output_multiplier = output_multiplier;
+  op_params.output_shift = output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  op_params.lhs_cacheable = false;
+  op_params.rhs_cacheable = false;
+
+  tflite::reference_ops::FullyConnected(
+    op_params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(weights()),
+    getTensorData<uint8_t>(weights()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
+    getTensorShape(output()), getTensorData<uint8_t>(output()));
+}
+
+void FullyConnected::evalQuantizedS8() const
+{
+  // Combine the input, weights and output scales into an integer multiplier and a shift.
+  const double real_multiplier =
+    getQuantizedConvolutionMultipler(input()->scale(), weights()->scale(), output()->scale());
+  int32_t output_multiplier;
+  int output_shift;
+  quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
+
+  // Clamp range of the quantized output for the fused activation.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  calculateActivationRangeQuantized(params().activation, output(), &output_activation_min,
+                                    &output_activation_max);
+
+  // Same parameter set as the U8 path; this variant runs the int8_t PAL implementation.
+  tflite::FullyConnectedParams op_params{};
+  op_params.input_offset = -input()->zero_point();
+  op_params.weights_offset = -weights()->zero_point();
+  op_params.output_offset = output()->zero_point();
+  op_params.output_multiplier = output_multiplier;
+  op_params.output_shift = output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  op_params.lhs_cacheable = false;
+  op_params.rhs_cacheable = false;
+
+  luci_interpreter_pal::FullyConnected<int8_t>(
+    op_params, getTensorShape(input()), getTensorData<int8_t>(input()), getTensorShape(weights()),
+    getTensorData<int8_t>(weights()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
+    getTensorShape(output()), getTensorData<int8_t>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/FullyConnected.h b/compiler/luci-micro/luci-interpreter/src/kernels/FullyConnected.h
new file mode 100644
index 000000000..2a7c068c0
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/FullyConnected.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_FULLYCONNECTED_H
+#define LUCI_INTERPRETER_KERNELS_FULLYCONNECTED_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class FullyConnected : public KernelWithParams<FullyConnectedParams>
+{
+public:
+  FullyConnected(const Tensor *input, const Tensor *weights, const Tensor *bias, Tensor *output,
+                 const FullyConnectedParams &params);
+  // Positional accessors; bias() may return nullptr (FullyConnected.cpp null-checks it).
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *weights() const { return _inputs[1]; }
+  const Tensor *bias() const { return _inputs[2]; }
+  Tensor *output() const { return _outputs[0]; }
+  // Kernel interface overrides.
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;       // FLOAT32 input
+  void evalQuantized() const;   // U8 input
+  void evalQuantizedS8() const; // S8 input
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_FULLYCONNECTED_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/FullyConnected.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/FullyConnected.test.cpp
new file mode 100644
index 000000000..4474cc4fb
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/FullyConnected.test.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/FullyConnected.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Float variant: builds FLOAT32 tensors, runs FullyConnected with RELU, checks shape and data.
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> weights_shape,
+           std::initializer_list<int32_t> bias_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<float> input_data, std::initializer_list<float> weights_data,
+           std::initializer_list<float> bias_data, std::initializer_list<float> output_data)
+{
+  const std::unique_ptr<IMemoryManager> manager = std::make_unique<TestMemoryManager>();
+  IMemoryManager *const mm = manager.get();
+  Tensor in = makeInputTensor<DataType::FLOAT32>(input_shape, input_data, mm);
+  Tensor w = makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data, mm);
+  Tensor b = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, mm);
+  Tensor out = makeOutputTensor(DataType::FLOAT32);
+
+  FullyConnectedParams fc_params{};
+  fc_params.activation = Activation::RELU;
+
+  FullyConnected fc(&in, &w, &b, &out, fc_params);
+  fc.configure();
+  // Output storage is requested only once configure() has run.
+  mm->allocate_memory(out);
+  fc.execute();
+
+  EXPECT_THAT(extractTensorShape(out), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(extractTensorData<T>(out), FloatArrayNear(output_data));
+}
+
+template <>
+void Check<int8_t>(std::initializer_list<int32_t> input_shape,
+                   std::initializer_list<int32_t> weights_shape,
+                   std::initializer_list<int32_t> bias_shape,
+                   std::initializer_list<int32_t> output_shape,
+                   std::initializer_list<float> input_data,
+                   std::initializer_list<float> weights_data,
+                   std::initializer_list<float> bias_data, std::initializer_list<float> output_data)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  const float quantized_tolerance = getTolerance(-127, 128, 255);
+  std::pair<float, int32_t> input_quant_param = quantizationParams<int8_t>(-63.5, 64);
+  std::pair<float, int32_t> output_quant_param = quantizationParams<int8_t>(-127, 128);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S8>(input_shape, input_quant_param.first, input_quant_param.second,
+                                  input_data, memory_manager.get());
+  Tensor weights_tensor =
+    makeInputTensor<DataType::S8>(weights_shape, input_quant_param.first, input_quant_param.second,
+                                  weights_data, memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::S32>(bias_shape, input_quant_param.first * input_quant_param.first, 0,
+                                   bias_data, memory_manager.get());
+  Tensor output_tensor =
+    makeOutputTensor(DataType::S8, output_quant_param.first, output_quant_param.second);
+  // Above: weights reuse the input's quant pair; the S32 bias is given first * first and 0.
+  FullyConnectedParams params{};
+  params.activation = Activation::RELU;
+
+  FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor, params);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // The S8 result is dequantized and compared in the float domain within quantized_tolerance.
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(output_data, quantized_tolerance));
+}
+
+template <>
+void Check<uint8_t>(
+  std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> weights_shape,
+  std::initializer_list<int32_t> bias_shape, std::initializer_list<int32_t> output_shape,
+  std::initializer_list<float> input_data, std::initializer_list<float> weights_data,
+  std::initializer_list<float> bias_data, std::initializer_list<float> output_data)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  const float quantized_tolerance = getTolerance(-127, 128, 255);
+  std::pair<float, int32_t> input_quant_param = quantizationParams<uint8_t>(-63.5, 64);
+  std::pair<float, int32_t> output_quant_param = quantizationParams<uint8_t>(-127, 128);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>(input_shape, input_quant_param.first, input_quant_param.second,
+                                  input_data, memory_manager.get());
+  Tensor weights_tensor =
+    makeInputTensor<DataType::U8>(weights_shape, input_quant_param.first, input_quant_param.second,
+                                  weights_data, memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::S32>(bias_shape, input_quant_param.first * input_quant_param.first, 0,
+                                   bias_data, memory_manager.get());
+  Tensor output_tensor =
+    makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
+  // Above: weights reuse the input's quant pair; the S32 bias is given first * first and 0.
+  FullyConnectedParams params{};
+  params.activation = Activation::RELU;
+
+  FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor, params);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // The U8 result is dequantized and compared in the float domain within quantized_tolerance.
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(output_data, quantized_tolerance));
+}
+
+template <typename T> class FullyConnectedTest : public ::testing::Test
+{
+};
+// Each TYPED_TEST of this suite is instantiated once per type listed in DataTypes.
+using DataTypes = ::testing::Types<float, uint8_t, int8_t>;
+TYPED_TEST_SUITE(FullyConnectedTest, DataTypes);
+
+TYPED_TEST(FullyConnectedTest, Simple)
+{
+  Check<TypeParam>({3, 2, 2, 1}, {3, 6}, {3}, {2, 3}, // input, weights, bias, output shapes
+                   { // input data
+                     -3, -5, 5, 4, 9, -2,  // batch = 0
+                     -3, -2, -4, 9, -8, 1, // batch = 1
+                   },
+                   { // weights data
+                     -3, -7, 4, -4, -6, 4, // unit = 0
+                     3, 5, 2, 3, -3, -8,   // unit = 1
+                     -3, 7, 4, 9, 0, -5,   // unit = 2
+                   },
+                   {-1, -5, -8}, // bias data
+                   { // expected output; Check() runs the kernel with RELU activation
+                     0, 0, 32,   // batch = 0
+                     22, 11, 47, // batch = 1
+                   });
+}
+
+TEST(FullyConnectedTest, InvalidBiasType_NEG)
+{
+  Shape input_shape{3, 2, 2, 1};
+  std::vector<float> input_data{
+    -3, -5, 5,  4, 9,  -2, // batch = 0
+    -3, -2, -4, 9, -8, 1,  // batch = 1
+  };
+  Shape weights_shape{3, 6};
+  std::vector<float> weights_data{
+    -3, -7, 4, -4, -6, 4,  // unit = 0
+    3,  5,  2, 3,  -3, -8, // unit = 1
+    -3, 7,  4, 9,  0,  -5, // unit = 2
+  };
+  Shape bias_shape{3};
+  std::vector<int32_t> bias_data{-1, -5, -8};
+
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  // The bias tensor is S32 although input, weights and output are all FLOAT32.
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor weights_tensor =
+    makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data, memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S32>(bias_shape, bias_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  FullyConnectedParams params{};
+  params.activation = Activation::RELU;
+  // configure() is expected to reject this type combination by throwing.
+  FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(FullyConnectedTest, InvalidWeightShapeDim_NEG)
+{
+  Shape input_shape{3, 2, 2, 1};
+  std::vector<float> input_data{
+    -3, -5, 5,  4, 9,  -2, // batch = 0
+    -3, -2, -4, 9, -8, 1,  // batch = 1
+  };
+  Shape weights_shape{1, 3, 6};
+  std::vector<float> weights_data{
+    -3, -7, 4, -4, -6, 4,  // unit = 0
+    3,  5,  2, 3,  -3, -8, // unit = 1
+    -3, 7,  4, 9,  0,  -5, // unit = 2
+  };
+  Shape bias_shape{3};
+  std::vector<float> bias_data{-1, -5, -8};
+
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  // The weights tensor is rank 3 ({1, 3, 6}), unlike the rank-2 {3, 6} used by the Simple test.
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor weights_tensor =
+    makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data, memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  FullyConnectedParams params{};
+  params.activation = Activation::RELU;
+  // configure() is expected to throw for this weights shape.
+  FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(FullyConnectedTest, BiasElementNumWeightDimMismatch_NEG)
+{
+  Shape input_shape{3, 2, 2, 1};
+  std::vector<float> input_data{
+    -3, -5, 5,  4, 9,  -2, // batch = 0
+    -3, -2, -4, 9, -8, 1,  // batch = 1
+  };
+  Shape weights_shape{6, 3};
+  std::vector<float> weights_data{
+    -3, -7, 4,  // unit = 0
+    -4, -6, 4,  // unit = 1
+    3,  5,  2,  // unit = 2
+    3,  -3, -8, // unit = 3
+    -3, 7,  4,  // unit = 4
+    9,  0,  -5, // unit = 5
+  };
+  Shape bias_shape{3};
+  std::vector<float> bias_data{-1, -5, -8};
+
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  // Weights are {6, 3} (six units) while the bias holds only 3 elements.
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor weights_tensor =
+    makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data, memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  FullyConnectedParams params{};
+  params.activation = Activation::RELU;
+  // configure() is expected to throw for this bias/weights mismatch.
+  FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Gather.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Gather.cpp
new file mode 100644
index 000000000..f1256660f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Gather.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Gather.h"
+#include "kernels/Utils.h"
+#include "PALGather.h"
+
+#include <stdexcept>
+#include <cassert>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+// Forwards {params, indices} as inputs, {output} as the output and gparams to the base class.
+Gather::Gather(const Tensor *params, const Tensor *indices, Tensor *output,
+               const GatherParams &gparams)
+  : KernelWithParams<GatherParams>({params, indices}, {output}, gparams)
+{
+}
+
+void Gather::configure()
+{
+  if (params()->element_type() == DataType::FLOAT32) // only FLOAT32 params are supported
+  {
+    LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32);
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+  // Indices may be either 32-bit or 64-bit signed integers.
+  LUCI_INTERPRETER_CHECK(indices()->element_type() == DataType::S32 ||
+                         indices()->element_type() == DataType::S64);
+
+  // The axis/batch_dims checks and output shape below follow tensorflow/lite/kernels/gather.cc.
+
+  const Shape &params_shape = params()->shape();
+  const Shape &indices_shape = indices()->shape();
+  // A negative axis is counted from the end of the params dimensions.
+  int axis = _params.axis;
+  if (axis < 0)
+  {
+    axis += params_shape.num_dims();
+  }
+  LUCI_INTERPRETER_CHECK(0 <= axis && axis < params_shape.num_dims());
+
+  int batch_dims = _params.batch_dims;
+  // batch_dims should be in range: [-rank(indices), rank(indices)].
+  // A negative batch_dims is offset by the rank of indices.
+  if (batch_dims < 0)
+  {
+    batch_dims += indices_shape.num_dims();
+  }
+  LUCI_INTERPRETER_CHECK(batch_dims <= axis);
+  LUCI_INTERPRETER_CHECK(0 <= batch_dims && batch_dims < params_shape.num_dims());
+  LUCI_INTERPRETER_CHECK(batch_dims <= indices_shape.num_dims());
+  for (int i = 0; i < batch_dims; ++i) // leading batch dims of params and indices must match
+  {
+    LUCI_INTERPRETER_CHECK(params_shape.dim(i) == indices_shape.dim(i));
+  }
+
+  const int num_dimensions = params_shape.num_dims() + indices_shape.num_dims() - 1 - batch_dims;
+  // Output dims: params[0, axis), then indices[batch_dims, rank), then params(axis, rank).
+  Shape output_shape(num_dimensions);
+  int output_index = 0;
+  for (int i = 0; i < axis; ++i)
+  {
+    output_shape.dim(output_index++) = params_shape.dim(i);
+  }
+  for (int i = batch_dims; i < indices_shape.num_dims(); ++i)
+  {
+    output_shape.dim(output_index++) = indices_shape.dim(i);
+  }
+  for (int i = axis + 1; i < params_shape.num_dims(); ++i)
+  {
+    output_shape.dim(output_index++) = params_shape.dim(i);
+  }
+  output()->resize(output_shape);
+}
+
+// Runs the kernel; only FLOAT32 params are dispatched, any other element type throws.
+void Gather::execute() const
+{
+  const bool is_float = params()->element_type() == DataType::FLOAT32;
+  if (!is_float)
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+
+  evalFloat();
+}
+
+// Gathers FLOAT32 params with the configured axis/batch_dims, using S32 or S64 indices.
+void Gather::evalFloat() const
+{
+  const auto index_type = indices()->element_type();
+  assert(index_type == DataType::S32 || index_type == DataType::S64);
+
+  tflite::GatherParams op_params;
+  op_params.axis = _params.axis;
+  op_params.batch_dims = _params.batch_dims;
+
+  const auto params_shape = getTensorShape(params());
+  const auto indices_shape = getTensorShape(indices());
+  const auto output_shape = getTensorShape(output());
+  const auto src = getTensorData<float>(params());
+  auto dst = getTensorData<float>(output());
+
+  // Any index type other than S32 takes the 64-bit path (see the assert above).
+  if (index_type != DataType::S32)
+  {
+    luci_interpreter_pal::Gather<float, int64_t>(op_params, params_shape, src, indices_shape,
+                                                 getTensorData<int64_t>(indices()), output_shape,
+                                                 dst);
+    return;
+  }
+  luci_interpreter_pal::Gather<float, int32_t>(op_params, params_shape, src, indices_shape,
+                                               getTensorData<int32_t>(indices()), output_shape,
+                                               dst);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Gather.h b/compiler/luci-micro/luci-interpreter/src/kernels/Gather.h
new file mode 100644
index 000000000..cc02d64fb
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Gather.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_GATHER_H
+#define LUCI_INTERPRETER_KERNELS_GATHER_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Gather : public KernelWithParams<GatherParams>
+{
+public:
+  Gather(const Tensor *params, const Tensor *indices, Tensor *output, const GatherParams &gparams);
+  // Input slot 0 is the tensor gathered from (params), slot 1 the indices; output is slot 0.
+  const Tensor *params() const { return _inputs[0]; }
+  const Tensor *indices() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // Entry points overriding the base-class virtuals.
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const; // the only evaluation routine declared by this kernel
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_GATHER_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Gather.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Gather.test.cpp
new file mode 100644
index 000000000..4b3dda708
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Gather.test.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Gather.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class GatherTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Assigned in SetUp(), which gtest runs before each test body of this fixture.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(GatherTest, Simple)
+{
+  std::vector<float> params_data{1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
+  std::vector<int32_t> indices_data{1, 0, 1, 5};
+  std::vector<float> ref_output_data{2.f, 1.f, 2.f, 6.f};
+  // params is {1, 6}; gathering along axis 1 with indices {1, 0, 1, 5} picks 2, 1, 2, 6.
+  Tensor params_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 6}, params_data, _memory_manager.get());
+  Tensor indices_tensor = makeInputTensor<DataType::S32>({4}, indices_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  GatherParams gparams;
+
+  gparams.axis = 1;
+  gparams.batch_dims = 0;
+  // configure() resizes the output tensor, so its memory is allocated afterwards.
+  Gather kernel(&params_tensor, &indices_tensor, &output_tensor, gparams);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor),
+              ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 4}));
+}
+
+TEST_F(GatherTest, Simple_Batch)
+{
+  Shape params_shape = {3, 5};
+  Shape indices_shape = {3, 2};
+  std::vector<float> params_data{0., 0., 1., 0., 2., 3., 0., 0., 0., 4., 0., 5., 0., 6., 0.};
+  std::vector<int32_t> indices_data{2, 4, 0, 4, 1, 3};
+  std::vector<float> ref_output_data{1., 2., 3., 4., 5., 6.};
+  // With batch_dims = 1, row i of indices ({3, 2}) selects columns from row i of params ({3, 5}).
+  Tensor params_tensor =
+    makeInputTensor<DataType::FLOAT32>(params_shape, params_data, _memory_manager.get());
+  Tensor indices_tensor =
+    makeInputTensor<DataType::S32>(indices_shape, indices_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  GatherParams gparams;
+
+  gparams.axis = 1;
+  gparams.batch_dims = 1;
+
+  Gather kernel(&params_tensor, &indices_tensor, &output_tensor, gparams);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor),
+              ::testing::ElementsAreArray(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3, 2}));
+}
+
+TEST_F(GatherTest, Simple_NEG)
+{
+  Tensor params_tensor = makeInputTensor<DataType::S32>({1}, {1}, _memory_manager.get());
+  Tensor indices_tensor = makeInputTensor<DataType::S32>({1}, {0}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  GatherParams gparams{}; // value-initialized: no field is set below, yet the kernel receives it
+  // S32 params are unsupported, so configure() throws before axis/batch_dims are examined.
+  Gather kernel(&params_tensor, &indices_tensor, &output_tensor, gparams);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(GatherTest, Axis_NEG)
+{
+  Tensor params_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor indices_tensor = makeInputTensor<DataType::S32>({1}, {0}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  GatherParams gparams;
+  // axis 100 lies outside [0, rank) of the rank-1 params tensor, so configure() must throw.
+  gparams.axis = 100;
+  gparams.batch_dims = 0;
+
+  Gather kernel(&params_tensor, &indices_tensor, &output_tensor, gparams);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(GatherTest, Batch_NEG)
+{
+  std::vector<float> params_data{1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
+  std::vector<int32_t> indices_data{1, 0, 1, 5};
+
+  Tensor params_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 6}, params_data, _memory_manager.get());
+  Tensor indices_tensor = makeInputTensor<DataType::S32>({4}, indices_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  GatherParams gparams;
+
+  // batch_dims (1) exceeds axis (0); configure() requires batch_dims <= axis and must throw.
+  gparams.axis = 0;
+  gparams.batch_dims = 1;
+
+  Gather kernel(&params_tensor, &indices_tensor, &output_tensor, gparams);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Greater.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Greater.cpp
new file mode 100644
index 000000000..5ccae3c38
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Greater.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Greater.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/comparisons.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+// Passes {x, y} as the inputs and {output} as the only output to Kernel; the body is empty.
+Greater::Greater(const Tensor *x, const Tensor *y, Tensor *output) : Kernel({x, y}, {output}) {}
+
+void Greater::configure()
+{
+  LUCI_INTERPRETER_CHECK(x()->element_type() == y()->element_type());
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::BOOL);
+  // Only U8 inputs need the per-input multiplier/shift pairs; evalQuantized() reads them later.
+  if (x()->element_type() == DataType::U8)
+  {
+    quantizeMultiplierSmallerThanOneExp(x()->scale(), &_x_multiplier, &_x_shift);
+    quantizeMultiplierSmallerThanOneExp(y()->scale(), &_y_multiplier, &_y_shift);
+  }
+  output()->resize(calculateShapeForBroadcast(x()->shape(), y()->shape()));
+}
+
+// Runs the comparison, choosing the evaluation routine from the element type of x.
+// configure() requires y to have the same element type as x.
+void Greater::execute() const
+{
+  const auto x_type = x()->element_type();
+
+  if (x_type == DataType::FLOAT32)
+    evalFloat();
+  else if (x_type == DataType::S64)
+    evalInteger<int64_t>();
+  else if (x_type == DataType::S32)
+    evalInteger<int32_t>();
+  else if (x_type == DataType::U8)
+    evalQuantized();
+  else
+  {
+    // Every other element type is not handled by this kernel.
+    throw std::runtime_error("Unsupported type.");
+  }
+}
+
+// FLOAT32 path: writes the element-wise Greater result for x and y into the BOOL output.
+void Greater::evalFloat() const
+{
+  const auto lhs_shape = getTensorShape(x());
+  const auto rhs_shape = getTensorShape(y());
+  const auto out_shape = getTensorShape(output());
+  const auto lhs = getTensorData<float>(x());
+  const auto rhs = getTensorData<float>(y());
+  auto out = getTensorData<bool>(output());
+
+  tflite::ComparisonParams op_params;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+  if (!op_params.is_broadcast)
+  {
+    tflite::reference_ops::Greater(op_params, lhs_shape, lhs, rhs_shape, rhs, out_shape, out);
+    return;
+  }
+  // Shapes differ: use the broadcasting variant.
+  tflite::reference_ops::Broadcast4DSlowGreater(op_params, lhs_shape, lhs, rhs_shape, rhs,
+                                                out_shape, out);
+}
+
+// Integer path for element type T; calls the NoScaling reference ops on the raw values.
+template <typename T> void Greater::evalInteger() const
+{
+  const auto lhs_shape = getTensorShape(x());
+  const auto rhs_shape = getTensorShape(y());
+  const auto out_shape = getTensorShape(output());
+  const auto lhs = getTensorData<T>(x());
+  const auto rhs = getTensorData<T>(y());
+  auto out = getTensorData<bool>(output());
+
+  tflite::ComparisonParams op_params;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (!op_params.is_broadcast)
+  {
+    tflite::reference_ops::GreaterNoScaling(op_params, lhs_shape, lhs, rhs_shape, rhs, out_shape,
+                                            out);
+    return;
+  }
+  tflite::reference_ops::Broadcast4DSlowGreaterNoScaling(op_params, lhs_shape, lhs, rhs_shape, rhs,
+                                                         out_shape, out);
+}
+
+void Greater::evalQuantized() const
+{
+  const auto x_data = getTensorData<uint8_t>(x());
+  const auto y_data = getTensorData<uint8_t>(y());
+  auto output_data = getTensorData<bool>(output());
+  // Fill the scaling parameters passed to the *WithScaling reference ops below.
+  tflite::ComparisonParams op_params;
+  op_params.left_shift = 8;
+  op_params.input1_offset = -x()->zero_point(); // the negated zero point is intentional
+  op_params.input1_shift = _x_shift;            // shift/multiplier were set in configure()
+  op_params.input1_multiplier = _x_multiplier;
+  op_params.input2_offset = -y()->zero_point(); // the negated zero point is intentional
+  op_params.input2_shift = _y_shift;
+  op_params.input2_multiplier = _y_multiplier;
+  op_params.is_broadcast = x()->shape() != y()->shape();
+  // Inputs with different shapes go through the broadcasting variant.
+  if (op_params.is_broadcast)
+  {
+    tflite::reference_ops::Broadcast4DSlowGreaterWithScaling(op_params, getTensorShape(x()), x_data,
+                                                             getTensorShape(y()), y_data,
+                                                             getTensorShape(output()), output_data);
+  }
+  else
+  {
+    tflite::reference_ops::GreaterWithScaling(op_params, getTensorShape(x()), x_data,
+                                              getTensorShape(y()), y_data, getTensorShape(output()),
+                                              output_data);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Greater.h b/compiler/luci-micro/luci-interpreter/src/kernels/Greater.h
new file mode 100644
index 000000000..065f76d7b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Greater.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_GREATER_H
+#define LUCI_INTERPRETER_KERNELS_GREATER_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Greater : public Kernel
+{
+public:
+  Greater(const Tensor *x, const Tensor *y, Tensor *output);
+  // Slot accessors: x and y are inputs 0 and 1, the comparison result is output 0.
+  const Tensor *x() const { return _inputs[0]; }
+  const Tensor *y() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // Entry points overriding the base-class virtuals.
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  template <typename T> void evalInteger() const;
+  void evalQuantized() const;
+  // The members below are set by configure() for U8 inputs and read by evalQuantized().
+private:
+  int32_t _x_multiplier = 0;
+  int _x_shift = 0;
+  int32_t _y_multiplier = 0;
+  int _y_shift = 0;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_GREATER_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Greater.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Greater.test.cpp
new file mode 100644
index 000000000..a48080124
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Greater.test.cpp
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Greater.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class GreaterTest : public ::testing::Test // fixture used by every TEST_F(GreaterTest, ...) below
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); } // new manager
+
+  std::unique_ptr<IMemoryManager> _memory_manager; // handed to tensor factories; allocates outputs
+};
+
+TEST_F(GreaterTest, FloatSimple) // same-shape FLOAT32 operands, element-wise x > y
+{
+  const std::vector<float> lhs{
+    0.5, 0.7, 0.9, // Row 1
+    1,   0,   -1,  // Row 2
+  };
+
+  const std::vector<float> rhs{
+    0.9, 0.7, 0.5, // Row 1
+    -1,  0,   1,   // Row 2
+  };
+
+  const std::vector<bool> expected{
+    false, false, true,  // Row 1
+    true,  false, false, // Row 2
+  };
+
+  auto lhs_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, lhs, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, rhs, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  op.configure();
+  _memory_manager->allocate_memory(result_tensor); // output storage is allocated after configure()
+  op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result_tensor), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result_tensor), ::testing::ElementsAreArray({2, 3}));
+}
+
+TEST_F(GreaterTest, FloatBroardcast) // {3, 3} x compared against a {1, 3} y reused for each row
+{
+  const std::vector<float> lhs{
+    0.5, 0.7, 0.9, // Row 1
+    1,   0,   -1,  // Row 2
+    -1,  0,   1,   // Row 3
+  };
+
+  const std::vector<float> rhs{
+    0.9, 0.7, 0.5, // Row 1
+  };
+
+  const std::vector<bool> expected{
+    false, false, true,  // Row 1
+    true,  false, false, // Row 2
+    false, false, true,  // Row 3
+  };
+
+  auto lhs_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, lhs, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, rhs, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  op.configure();
+  _memory_manager->allocate_memory(result_tensor); // output storage is allocated after configure()
+  op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result_tensor), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result_tensor), ::testing::ElementsAreArray({3, 3}));
+}
+
+template <loco::DataType DType> // helper: same-shape integer comparison at the type's extremes
+void checkIntegerSimple(luci_interpreter::IMemoryManager *memory_manager)
+{
+  using value_t = typename loco::DataTypeImpl<DType>::Type;
+  const value_t lowest = std::numeric_limits<value_t>::min();
+  const value_t highest = std::numeric_limits<value_t>::max();
+  const std::vector<value_t> lhs{lowest, 2, highest};
+
+  const std::vector<value_t> rhs{lowest + 1, -2, highest};
+
+  const std::vector<bool> expected{false, true, false};
+
+  auto lhs_tensor = makeInputTensor<DType>({3}, lhs, memory_manager);
+  auto rhs_tensor = makeInputTensor<DType>({3}, rhs, memory_manager);
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  op.configure();
+  memory_manager->allocate_memory(result_tensor);
+  op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result_tensor), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result_tensor), ::testing::ElementsAreArray({3}));
+}
+
+template <loco::DataType DType> // helper: {4, 3} integer x against a {3} y reused for each row
+void checkIntegerBroadcast(luci_interpreter::IMemoryManager *memory_manager)
+{
+  using value_t = typename loco::DataTypeImpl<DType>::Type;
+  const value_t lowest = std::numeric_limits<value_t>::min();
+  const value_t highest = std::numeric_limits<value_t>::max();
+  const std::vector<value_t> lhs{
+    lowest, 2,  3,       // Row 1
+    4,      5,  highest, // Row 2
+    -1,     -4, -3,      // Row 3
+    lowest, -2, highest, // Row 4
+  };
+
+  const std::vector<value_t> rhs{
+    lowest + 1, -2, highest - 1, // Row 1
+  };
+
+  const std::vector<bool> expected{
+    false, true,  false, // Row 1
+    true,  true,  true,  // Row 2
+    true,  false, false, // Row 3
+    false, false, true,  // Row 4
+  };
+
+  auto lhs_tensor = makeInputTensor<DType>({4, 3}, lhs, memory_manager);
+  auto rhs_tensor = makeInputTensor<DType>({3}, rhs, memory_manager);
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  op.configure();
+  memory_manager->allocate_memory(result_tensor);
+  op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result_tensor), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result_tensor), ::testing::ElementsAreArray({4, 3}));
+}
+
+TEST_F(GreaterTest, Int32) // runs both integer helpers with 32-bit signed elements
+{
+  checkIntegerSimple<DataType::S32>(_memory_manager.get());
+  checkIntegerBroadcast<DataType::S32>(_memory_manager.get());
+  SUCCEED(); // the real expectations live inside the helpers
+}
+
+TEST_F(GreaterTest, Int64) // runs both integer helpers with 64-bit signed elements
+{
+  checkIntegerSimple<DataType::S64>(_memory_manager.get());
+  checkIntegerBroadcast<DataType::S64>(_memory_manager.get());
+  SUCCEED(); // the real expectations live inside the helpers
+}
+
+// Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+const float F_MIN = -128.0 / 128.0; // -1.0, lower bound passed to quantizationParams<uint8_t>
+const float F_MAX = 127.0 / 128.0; // 0.9921875, upper bound passed to quantizationParams<uint8_t>
+
+TEST_F(GreaterTest, Uint8Quantized) // both U8 operands share one set of quantization parameters
+{
+  const std::vector<float> lhs{
+    0.5, 0.6, 0.7,  0.9, // Row 1
+    1,   0,   0.05, -1,  // Row 2
+  };
+
+  const std::vector<float> rhs{
+    0.9, 0.6,  0.6, 0.5, // Row 1
+    -1,  0.05, 0,   1,   // Row 2
+  };
+
+  const std::vector<bool> expected{
+    false, false, true, true,  // Row 1
+    true,  false, true, false, // Row 2
+  };
+
+  const std::pair<float, int32_t> qparams = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  auto lhs_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, qparams.first, qparams.second, lhs, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, qparams.first, qparams.second, rhs, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  op.configure();
+  _memory_manager->allocate_memory(result_tensor);
+  op.execute();
+
+  EXPECT_THAT(extractTensorShape(result_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(result_tensor), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(GreaterTest, Uint8QuantizedRescale) // x and y are quantized over different float ranges
+{
+  const std::vector<float> lhs{
+    0.5, 0.6, 0.7,  0.9, // Row 1
+    1,   0,   0.05, -1,  // Row 2
+  };
+
+  const std::vector<float> rhs{
+    0.9, 0.6,  0.6, 0.5, // Row 1
+    -1,  0.05, 0,   1,   // Row 2
+  };
+
+  const std::vector<bool> expected{
+    false, false, true, true,  // Row 1
+    true,  false, true, false, // Row 2
+  };
+
+  const std::pair<float, int32_t> lhs_q = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  const std::pair<float, int32_t> rhs_q = quantizationParams<uint8_t>(F_MIN * 2, F_MAX * 3);
+
+  auto lhs_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, lhs_q.first, lhs_q.second, lhs, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, rhs_q.first, rhs_q.second, rhs, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  op.configure();
+  _memory_manager->allocate_memory(result_tensor);
+  op.execute();
+
+  EXPECT_THAT(extractTensorShape(result_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(result_tensor), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(GreaterTest, Uint8QuantizedBroadcast) // {1, 3, 4, 1} U8 x against a {1, 1, 4, 1} U8 y
+{
+  const std::vector<float> lhs{
+    0.4,  -0.8, 0.7,  0.3, // Row 1
+    -0.5, 0.1,  0,    0.5, // Row 2
+    1,    0,    0.05, -1,  // Row 3
+  };
+
+  const std::vector<float> rhs{
+    -1, 0.05, 0, 1, // Row 1
+  };
+
+  const std::vector<bool> expected{
+    true, false, true,  false, // Row 1
+    true, true,  false, false, // Row 2
+    true, false, true,  false, // Row 3
+  };
+
+  const std::pair<float, int32_t> qparams = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  auto lhs_tensor = makeInputTensor<DataType::U8>(
+    {1, 3, 4, 1}, qparams.first, qparams.second, lhs, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::U8>(
+    {1, 1, 4, 1}, qparams.first, qparams.second, rhs, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  op.configure();
+  _memory_manager->allocate_memory(result_tensor);
+  op.execute();
+
+  EXPECT_THAT(extractTensorShape(result_tensor), ::testing::ElementsAreArray({1, 3, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(result_tensor), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(GreaterTest, Input_Type_Mismatch_NEG) // FLOAT32 x paired with U8 y must be rejected
+{
+  auto lhs_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::U8>({1}, {1}, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  EXPECT_ANY_THROW(op.configure());
+}
+
+TEST_F(GreaterTest, Input_Output_Type_NEG) // a FLOAT32 output tensor must be rejected
+{
+  auto lhs_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Greater op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  EXPECT_ANY_THROW(op.configure());
+}
+
+TEST_F(GreaterTest, Float_Broadcast_NEG) // FLOAT32 shapes {2} and {3} must be rejected
+{
+  auto lhs_tensor = makeInputTensor<DataType::FLOAT32>({2}, {1.f, 2.f}, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::FLOAT32>({3}, {1.f, 2.f, 3.f}, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  EXPECT_ANY_THROW(op.configure());
+}
+
+TEST_F(GreaterTest, Int32_Broadcast_NEG) // S32 shapes {2} and {3} must be rejected
+{
+  auto lhs_tensor = makeInputTensor<DataType::S32>({2}, {1, 2}, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::S32>({3}, {1, 2, 3}, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  EXPECT_ANY_THROW(op.configure());
+}
+
+TEST_F(GreaterTest, Int64_Broadcast_NEG) // S64 shapes {2} and {3} must be rejected
+{
+  auto lhs_tensor = makeInputTensor<DataType::S64>({2}, {1, 2}, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::S64>({3}, {1, 2, 3}, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  Greater op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  EXPECT_ANY_THROW(op.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/GreaterEqual.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/GreaterEqual.cpp
new file mode 100644
index 000000000..27e42c971
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/GreaterEqual.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/GreaterEqual.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/comparisons.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+GreaterEqual::GreaterEqual(const Tensor *x, const Tensor *y, Tensor *output)
+  : Kernel({x, y}, {output}) // x and y become input slots 0 and 1; output becomes output slot 0
+{
+}
+
+void GreaterEqual::configure()
+{
+  const DataType input_type = x()->element_type();
+  LUCI_INTERPRETER_CHECK(input_type == y()->element_type()); // operands must share a type
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::BOOL);
+  if (input_type == DataType::U8)
+  { // derive the per-input multiplier/shift pairs that evalQuantized() later feeds to TFLite
+    quantizeMultiplierSmallerThanOneExp(x()->scale(), &_x_multiplier, &_x_shift);
+    quantizeMultiplierSmallerThanOneExp(y()->scale(), &_y_multiplier, &_y_shift);
+  }
+  output()->resize(calculateShapeForBroadcast(x()->shape(), y()->shape()));
+}
+
+void GreaterEqual::execute() const
+{
+  switch (x()->element_type()) // dispatch on the type of the left-hand operand
+  {
+    case DataType::U8:
+      evalQuantized();
+      break;
+    case DataType::S32:
+      evalInteger<int32_t>();
+      break;
+    case DataType::S64:
+      evalInteger<int64_t>();
+      break;
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    default: // any other element type has no evaluation path
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void GreaterEqual::evalFloat() const
+{
+  tflite::ComparisonParams params;
+  params.is_broadcast = x()->shape() != y()->shape();
+  const auto lhs_shape = getTensorShape(x());
+  const auto rhs_shape = getTensorShape(y());
+  const auto out_shape = getTensorShape(output());
+  const auto lhs = getTensorData<float>(x());
+  const auto rhs = getTensorData<float>(y());
+  auto out = getTensorData<bool>(output());
+
+  if (!params.is_broadcast)
+  {
+    // Operand shapes are equal: use the plain reference op.
+    tflite::reference_ops::GreaterEqual(params, lhs_shape, lhs, rhs_shape, rhs, out_shape, out);
+    return;
+  }
+  // Operand shapes differ: use the Broadcast4DSlow reference op.
+  tflite::reference_ops::Broadcast4DSlowGreaterEqual(params, lhs_shape, lhs, rhs_shape, rhs,
+                                                     out_shape, out);
+}
+
+template <typename T> void GreaterEqual::evalInteger() const
+{
+  tflite::ComparisonParams params;
+  params.is_broadcast = x()->shape() != y()->shape();
+  const auto lhs_shape = getTensorShape(x());
+  const auto rhs_shape = getTensorShape(y());
+  const auto out_shape = getTensorShape(output());
+  const auto lhs = getTensorData<T>(x());
+  const auto rhs = getTensorData<T>(y());
+  auto out = getTensorData<bool>(output());
+
+  if (!params.is_broadcast)
+  {
+    // Operand shapes are equal: use the plain NoScaling reference op.
+    tflite::reference_ops::GreaterEqualNoScaling(params, lhs_shape, lhs, rhs_shape, rhs,
+                                                 out_shape, out);
+    return;
+  }
+  // Operand shapes differ: use the Broadcast4DSlow NoScaling reference op.
+  tflite::reference_ops::Broadcast4DSlowGreaterEqualNoScaling(params, lhs_shape, lhs, rhs_shape,
+                                                              rhs, out_shape, out);
+}
+
+void GreaterEqual::evalQuantized() const
+{
+  tflite::ComparisonParams params;
+  params.is_broadcast = x()->shape() != y()->shape();
+  params.left_shift = 8;
+  // Each offset is the NEGATED zero point; multiplier/shift were computed in configure().
+  params.input1_offset = -x()->zero_point();
+  params.input1_multiplier = _x_multiplier;
+  params.input1_shift = _x_shift;
+  params.input2_offset = -y()->zero_point();
+  params.input2_multiplier = _y_multiplier;
+  params.input2_shift = _y_shift;
+
+  const auto lhs_shape = getTensorShape(x());
+  const auto rhs_shape = getTensorShape(y());
+  const auto out_shape = getTensorShape(output());
+  const auto lhs = getTensorData<uint8_t>(x());
+  const auto rhs = getTensorData<uint8_t>(y());
+  auto out = getTensorData<bool>(output());
+
+  if (!params.is_broadcast)
+  { // operand shapes are equal: plain WithScaling reference op
+    tflite::reference_ops::GreaterEqualWithScaling(params, lhs_shape, lhs, rhs_shape, rhs,
+                                                   out_shape, out);
+    return;
+  }
+  tflite::reference_ops::Broadcast4DSlowGreaterEqualWithScaling(params, lhs_shape, lhs, rhs_shape,
+                                                                rhs, out_shape, out);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/GreaterEqual.h b/compiler/luci-micro/luci-interpreter/src/kernels/GreaterEqual.h
new file mode 100644
index 000000000..e333c30a6
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/GreaterEqual.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_GREATER_EQUAL_H
+#define LUCI_INTERPRETER_KERNELS_GREATER_EQUAL_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class GreaterEqual : public Kernel // wraps the TFLite reference GreaterEqual ops; output is BOOL
+{
+public:
+  GreaterEqual(const Tensor *x, const Tensor *y, Tensor *output); // two operands, one result tensor
+
+  const Tensor *x() const { return _inputs[0]; } // first input slot: left-hand operand
+  const Tensor *y() const { return _inputs[1]; } // second input slot: right-hand operand
+  Tensor *output() const { return _outputs[0]; } // only output slot: comparison result
+
+  void configure() override; // checks types, prepares U8 scaling, resizes output to broadcast shape
+  void execute() const override; // dispatches on x's type; throws std::runtime_error if unsupported
+
+private:
+  void evalFloat() const; // FLOAT32 path
+  template <typename T> void evalInteger() const; // S32 / S64 path; T is the element type
+  void evalQuantized() const; // U8 path; uses the multiplier/shift members below
+
+private:
+  int32_t _x_multiplier = 0; // set in configure() from x's scale when inputs are U8
+  int _x_shift = 0; // shift paired with _x_multiplier
+  int32_t _y_multiplier = 0; // set in configure() from y's scale when inputs are U8
+  int _y_shift = 0; // shift paired with _y_multiplier
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_GREATER_EQUAL_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/GreaterEqual.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/GreaterEqual.test.cpp
new file mode 100644
index 000000000..35bf88eab
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/GreaterEqual.test.cpp
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/GreaterEqual.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class GreaterEqualTest : public ::testing::Test // fixture used by every GreaterEqual test below
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); } // new manager
+
+  std::unique_ptr<IMemoryManager> _memory_manager; // handed to tensor factories; allocates outputs
+};
+
+TEST_F(GreaterEqualTest, FloatSimple) // same-shape FLOAT32 operands, element-wise x >= y
+{
+  const std::vector<float> lhs{
+    0.5, 0.7, 0.9, // Row 1
+    1,   0,   -1,  // Row 2
+  };
+
+  const std::vector<float> rhs{
+    0.9, 0.7, 0.5, // Row 1
+    -1,  0,   1,   // Row 2
+  };
+
+  const std::vector<bool> expected{
+    false, true, true,  // Row 1
+    true,  true, false, // Row 2
+  };
+
+  auto lhs_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, lhs, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, rhs, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  op.configure();
+  _memory_manager->allocate_memory(result_tensor); // output storage is allocated after configure()
+  op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result_tensor), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result_tensor), ::testing::ElementsAreArray({2, 3}));
+}
+
+TEST_F(GreaterEqualTest, FloatBroardcast) // {3, 3} x compared against a {1, 3} y reused per row
+{
+  const std::vector<float> lhs{
+    0.5, 0.7, 0.9, // Row 1
+    1,   0,   -1,  // Row 2
+    -1,  0,   1,   // Row 3
+  };
+
+  const std::vector<float> rhs{
+    0.9, 0.7, 0.5, // Row 1
+  };
+
+  const std::vector<bool> expected{
+    false, true,  true,  // Row 1
+    true,  false, false, // Row 2
+    false, false, true,  // Row 3
+  };
+
+  auto lhs_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, lhs, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, rhs, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  op.configure();
+  _memory_manager->allocate_memory(result_tensor); // output storage is allocated after configure()
+  op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result_tensor), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result_tensor), ::testing::ElementsAreArray({3, 3}));
+}
+template <loco::DataType DType> // helper: same-shape integer comparison at the type's extremes
+void checkIntegerSimple(luci_interpreter::IMemoryManager *memory_manager)
+{
+  using value_t = typename loco::DataTypeImpl<DType>::Type;
+  const value_t lowest = std::numeric_limits<value_t>::min();
+  const value_t highest = std::numeric_limits<value_t>::max();
+  const std::vector<value_t> lhs{lowest, 2, highest};
+
+  const std::vector<value_t> rhs{lowest + 1, -2, highest};
+
+  const std::vector<bool> expected{false, true, true};
+
+  auto lhs_tensor = makeInputTensor<DType>({3}, lhs, memory_manager);
+  auto rhs_tensor = makeInputTensor<DType>({3}, rhs, memory_manager);
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  op.configure();
+  memory_manager->allocate_memory(result_tensor);
+  op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result_tensor), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result_tensor), ::testing::ElementsAreArray({3}));
+}
+
+template <loco::DataType DType> // helper: {4, 3} integer x against a {3} y reused for each row
+void checkIntegerBroadcast(luci_interpreter::IMemoryManager *memory_manager)
+{
+  using value_t = typename loco::DataTypeImpl<DType>::Type;
+  const value_t lowest = std::numeric_limits<value_t>::min();
+  const value_t highest = std::numeric_limits<value_t>::max();
+  const std::vector<value_t> lhs{
+    lowest, 2,  3,           // Row 1
+    4,      5,  highest,     // Row 2
+    -1,     -4, -3,          // Row 3
+    lowest, -2, highest - 1, // Row 4
+  };
+
+  const std::vector<value_t> rhs{
+    lowest + 1, -2, highest - 1, // Row 1
+  };
+
+  const std::vector<bool> expected{
+    false, true,  false, // Row 1
+    true,  true,  true,  // Row 2
+    true,  false, false, // Row 3
+    false, true,  true,  // Row 4
+  };
+
+  auto lhs_tensor = makeInputTensor<DType>({4, 3}, lhs, memory_manager);
+  auto rhs_tensor = makeInputTensor<DType>({3}, rhs, memory_manager);
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  op.configure();
+  memory_manager->allocate_memory(result_tensor);
+  op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result_tensor), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result_tensor), ::testing::ElementsAreArray({4, 3}));
+}
+
+TEST_F(GreaterEqualTest, Int32) // runs both integer helpers with 32-bit signed elements
+{
+  checkIntegerSimple<DataType::S32>(_memory_manager.get());
+  checkIntegerBroadcast<DataType::S32>(_memory_manager.get());
+  SUCCEED(); // the real expectations live inside the helpers
+}
+
+TEST_F(GreaterEqualTest, Int64) // runs both integer helpers with 64-bit signed elements
+{
+  checkIntegerSimple<DataType::S64>(_memory_manager.get());
+  checkIntegerBroadcast<DataType::S64>(_memory_manager.get());
+  SUCCEED(); // the real expectations live inside the helpers
+}
+
+// Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+const float F_MIN = -128.0 / 128.0; // -1.0, lower bound passed to quantizationParams<uint8_t>
+const float F_MAX = 127.0 / 128.0; // 0.9921875, upper bound passed to quantizationParams<uint8_t>
+
+TEST_F(GreaterEqualTest, Uint8Quantized) // both U8 operands share one set of quantization params
+{
+  const std::vector<float> lhs{
+    0.5, 0.6, 0.7,  0.9, // Row 1
+    1,   0,   0.05, -1,  // Row 2
+  };
+
+  const std::vector<float> rhs{
+    0.9, 0.6,  0.55, 0.5, // Row 1
+    -1,  0.05, 0,    1,   // Row 2
+  };
+
+  const std::vector<bool> expected{
+    false, true,  true, true,  // Row 1
+    true,  false, true, false, // Row 2
+  };
+
+  const std::pair<float, int32_t> qparams = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  auto lhs_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, qparams.first, qparams.second, lhs, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, qparams.first, qparams.second, rhs, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  op.configure();
+  _memory_manager->allocate_memory(result_tensor);
+  op.execute();
+
+  EXPECT_THAT(extractTensorShape(result_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(result_tensor), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(GreaterEqualTest, Uint8QuantizedRescale) // x and y are quantized over different ranges
+{
+  const std::vector<float> lhs{
+    0.5, 0.5, 0.7,  0.9, // Row 1
+    1,   0,   0.05, -1,  // Row 2
+  };
+
+  const std::vector<float> rhs{
+    0.9, 0.5,  0.6, 0.5, // Row 1
+    -1,  0.05, 0,   1,   // Row 2
+  };
+
+  const std::vector<bool> expected{
+    false, true,  true, true,  // Row 1
+    true,  false, true, false, // Row 2
+  };
+
+  const std::pair<float, int32_t> lhs_q = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  const std::pair<float, int32_t> rhs_q = quantizationParams<uint8_t>(F_MIN * 1.2, F_MAX * 1.5);
+
+  auto lhs_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, lhs_q.first, lhs_q.second, lhs, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, rhs_q.first, rhs_q.second, rhs, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  op.configure();
+  _memory_manager->allocate_memory(result_tensor);
+  op.execute();
+
+  EXPECT_THAT(extractTensorShape(result_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(result_tensor), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(GreaterEqualTest, Uint8QuantizedBroadcast) // {1, 3, 4, 1} U8 x against a {1, 1, 4, 1} U8 y
+{
+  const std::vector<float> lhs{
+    0.4,  -0.8, 0.7,  0.3, // Row 1
+    -0.5, 0.1,  0,    0.5, // Row 2
+    1,    0,    0.05, -1,  // Row 3
+  };
+
+  const std::vector<float> rhs{
+    -1, 0.05, 0, 1, // Row 1
+  };
+
+  const std::vector<bool> expected{
+    true, false, true, false, // Row 1
+    true, true,  true, false, // Row 2
+    true, false, true, false, // Row 3
+  };
+
+  const std::pair<float, int32_t> qparams = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  auto lhs_tensor = makeInputTensor<DataType::U8>(
+    {1, 3, 4, 1}, qparams.first, qparams.second, lhs, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::U8>(
+    {1, 1, 4, 1}, qparams.first, qparams.second, rhs, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  op.configure();
+  _memory_manager->allocate_memory(result_tensor);
+  op.execute();
+
+  EXPECT_THAT(extractTensorShape(result_tensor), ::testing::ElementsAreArray({1, 3, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(result_tensor), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(GreaterEqualTest, Input_Type_Mismatch_NEG) // FLOAT32 x paired with U8 y must be rejected
+{
+  auto lhs_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::U8>({1}, {1}, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  EXPECT_ANY_THROW(op.configure());
+}
+
+TEST_F(GreaterEqualTest, Input_Output_Type_NEG) // a FLOAT32 output tensor must be rejected
+{
+  auto lhs_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  GreaterEqual op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  EXPECT_ANY_THROW(op.configure());
+}
+
+TEST_F(GreaterEqualTest, Float_Broadcast_NEG) // FLOAT32 shapes {2} and {3} must be rejected
+{
+  auto lhs_tensor = makeInputTensor<DataType::FLOAT32>({2}, {1.f, 2.f}, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::FLOAT32>({3}, {1.f, 2.f, 3.f}, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  EXPECT_ANY_THROW(op.configure());
+}
+
+TEST_F(GreaterEqualTest, Int32_Broadcast_NEG) // S32 shapes {2} and {3} must be rejected
+{
+  auto lhs_tensor = makeInputTensor<DataType::S32>({2}, {1, 2}, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::S32>({3}, {1, 2, 3}, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  EXPECT_ANY_THROW(op.configure());
+}
+
+TEST_F(GreaterEqualTest, Int64_Broadcast_NEG) // S64 shapes {2} and {3} must be rejected
+{
+  auto lhs_tensor = makeInputTensor<DataType::S64>({2}, {1, 2}, _memory_manager.get());
+  auto rhs_tensor = makeInputTensor<DataType::S64>({3}, {1, 2, 3}, _memory_manager.get());
+  auto result_tensor = makeOutputTensor(DataType::BOOL);
+
+  GreaterEqual op(&lhs_tensor, &rhs_tensor, &result_tensor);
+  EXPECT_ANY_THROW(op.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/If.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/If.cpp
new file mode 100644
index 000000000..971708bca
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/If.cpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/If.h"
+#include "kernels/Utils.h"
+
+#include <cstring>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+static std::vector<const Tensor *> joinInputs(const Tensor *cond,
+                                              const std::vector<const Tensor *> &inputs)
+{
+  std::vector<const Tensor *> joined(1, cond); // layout: cond first, then `inputs` in order
+  joined.insert(joined.end(), inputs.begin(), inputs.end());
+  return joined;
+}
+
+If::If(const Tensor *cond, const std::vector<const Tensor *> &inputs, std::vector<Tensor *> outputs,
+       RuntimeGraph *then_graph, RuntimeGraph *else_graph)
+  : Kernel(joinInputs(cond, inputs), std::move(outputs)), // inputs stored as [cond, inputs...]
+    _then_graph{then_graph}, _else_graph{else_graph}
+{
+}
+
+void If::configure()
+{
+  LUCI_INTERPRETER_CHECK(cond()->element_type() == DataType::BOOL); // condition must be BOOL ...
+  LUCI_INTERPRETER_CHECK(cond()->shape().num_elements() == 1);      // ... with exactly one element
+  // Each branch graph must take the kernel's inputs minus the condition and match its outputs.
+  for (RuntimeGraph *graph : {_then_graph, _else_graph})
+  {
+    (void)graph; // NOTE(review): presumably silences unused-variable warnings; confirm
+    LUCI_INTERPRETER_CHECK(graph->getInputTensors().size() == getInputTensors().size() - 1);
+    LUCI_INTERPRETER_CHECK(graph->getOutputTensors().size() == getOutputTensors().size());
+  }
+}
+
+void If::execute() const
+{
+  // Element 0 of the condition tensor decides which branch graph is run.
+  const bool take_then = cond()->data<bool>()[0];
+  RuntimeGraph *const branch = take_then ? _then_graph : _else_graph;
+  const auto &graph_inputs = branch->getInputTensors();
+  const auto &graph_outputs = branch->getOutputTensors();
+  const size_t num_branch_inputs = getInputTensors().size() - 1; // input 0 is the condition
+  // Step 1: hand every kernel input (after the condition) to the branch graph.
+  for (size_t i = 0; i < num_branch_inputs; ++i)
+  {
+    LUCI_INTERPRETER_CHECK(graph_inputs[i]->element_type() == input(i)->element_type());
+    graph_inputs[i]->resize(input(i)->shape());
+    // The byte count is derived from the kernel input: element count times element size.
+    const int32_t elem_count = input(i)->shape().num_elements();
+    const std::size_t elem_bytes = getDataTypeSize(input(i)->element_type());
+    // TODO: Think about how allocate memory for output in main graph
+    branch->configureAllocations(graph_inputs[i]);
+    std::memcpy(graph_inputs[i]->data<void>(), input(i)->data<void>(), elem_count * elem_bytes);
+  }
+  // Step 2: run the selected branch.
+  branch->execute();
+  const size_t num_outputs = getOutputTensors().size();
+  // Step 3: copy the branch results back into the kernel outputs.
+  for (size_t i = 0; i < num_outputs; ++i)
+  {
+    LUCI_INTERPRETER_CHECK(graph_outputs[i]->element_type() == output(i)->element_type());
+    output(i)->resize(graph_outputs[i]->shape());
+    // TODO: Think about how allocate memory for output in main graph
+    branch->configureAllocations(output(i));
+    // Sized from the kernel output, which was just resized to the branch output's shape.
+    const int32_t elem_count = output(i)->shape().num_elements();
+    const std::size_t elem_bytes = getDataTypeSize(output(i)->element_type());
+    std::memcpy(output(i)->data<void>(), graph_outputs[i]->data<void>(),
+                elem_count * elem_bytes);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/If.h b/compiler/luci-micro/luci-interpreter/src/kernels/If.h
new file mode 100644
index 000000000..fa6ab371a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/If.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_IF_H
+#define LUCI_INTERPRETER_KERNELS_IF_H
+
+#include "core/Kernel.h"
+#include "core/RuntimeGraph.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class If : public Kernel
+{
+public:
+  If(const Tensor *cond, const std::vector<const Tensor *> &inputs, std::vector<Tensor *> outputs,
+     RuntimeGraph *then_graph, RuntimeGraph *else_graph);
+  void configure() override;
+  void execute() const override;
+  // Tensor accessors: slot 0 of _inputs is the condition, so input(i) reads slot i + 1.
+  const Tensor *cond() const { return _inputs[0]; }
+  const Tensor *input(int index) const { return _inputs[index + 1]; }
+  Tensor *output(int index) const { return _outputs[index]; }
+
+private:
+  // Branch graphs supplied at construction; the pointers themselves are never reseated.
+  RuntimeGraph *const _then_graph;
+  RuntimeGraph *const _else_graph;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_IF_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/If.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/If.test.cpp
new file mode 100644
index 000000000..c5f4faf75
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/If.test.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "core/RuntimeModule.h"
+#include "kernels/Add.h"
+#include "kernels/If.h"
+#include "kernels/Mul.h"
+#include "kernels/TestUtils.h"
+
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class IfTest : public ::testing::Test
+{
+protected:
+  // Memory manager handed to the tensor helpers; SetUp() assigns a new TestMemoryManager.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+};
+
+RuntimeGraph *buildAddSubgraph(RuntimeModule *module, IMemoryManager *memory_manager)
+{
+  // Graph under construction: out = lhs + rhs through one Add kernel, activation NONE.
+  RuntimeGraph *subgraph = module->addGraph(memory_manager);
+  const auto new_float_tensor = [subgraph]() {
+    return subgraph->addTensor(
+      std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
+  };
+  Tensor *lhs = new_float_tensor();
+  Tensor *rhs = new_float_tensor();
+  Tensor *out = new_float_tensor();
+  // Allocate in creation order, then publish the graph's inputs and outputs.
+  for (Tensor *tensor : {lhs, rhs, out})
+    memory_manager->allocate_memory(*tensor);
+  subgraph->setInputTensors({lhs, rhs});
+  subgraph->setOutputTensors({out});
+
+  AddParams add_params{};
+  add_params.activation = Activation::NONE;
+  subgraph->addKernel(std::make_unique<Add>(lhs, rhs, out, add_params));
+
+  return subgraph;
+}
+
+RuntimeGraph *buildMulSubgraph(RuntimeModule *module, IMemoryManager *memory_manager)
+{
+  // Graph under construction: out = lhs * rhs through one Mul kernel, activation NONE.
+  RuntimeGraph *subgraph = module->addGraph(memory_manager);
+  const auto new_float_tensor = [subgraph]() {
+    return subgraph->addTensor(
+      std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
+  };
+  Tensor *lhs = new_float_tensor();
+  Tensor *rhs = new_float_tensor();
+  Tensor *out = new_float_tensor();
+  // Allocate in creation order, then publish the graph's inputs and outputs.
+  for (Tensor *tensor : {lhs, rhs, out})
+    memory_manager->allocate_memory(*tensor);
+  subgraph->setInputTensors({lhs, rhs});
+  subgraph->setOutputTensors({out});
+
+  MulParams mul_params{};
+  mul_params.activation = Activation::NONE;
+  subgraph->addKernel(std::make_unique<Mul>(lhs, rhs, out, mul_params));
+
+  return subgraph;
+}
+
+TEST_F(IfTest, CondTrue)
+{
+  // A true condition selects the `then` graph (Add): {5, 7} + {1, 2} -> {6, 9}.
+  Tensor cond = makeInputTensor<DataType::BOOL>({1}, {true}, _memory_manager.get());
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({2}, {5, 7}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>({1, 2}, {1, 2}, _memory_manager.get());
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  RuntimeModule module(nullptr);
+  RuntimeGraph *add_graph = buildAddSubgraph(&module, _memory_manager.get());
+  RuntimeGraph *mul_graph = buildMulSubgraph(&module, _memory_manager.get());
+
+  If kernel(&cond, {&lhs, &rhs}, {&output}, add_graph, mul_graph);
+  kernel.configure();
+  _memory_manager->allocate_memory(output);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output), FloatArrayNear({6, 9}));
+}
+
+TEST_F(IfTest, CondFalse)
+{
+  // A false condition selects the `else` graph (Mul): {5, 7} * {1, 2} -> {5, 14}.
+  Tensor cond = makeInputTensor<DataType::BOOL>({1}, {false}, _memory_manager.get());
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({2}, {5, 7}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>({1, 2}, {1, 2}, _memory_manager.get());
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  RuntimeModule module(nullptr);
+  RuntimeGraph *add_graph = buildAddSubgraph(&module, _memory_manager.get());
+  RuntimeGraph *mul_graph = buildMulSubgraph(&module, _memory_manager.get());
+
+  If kernel(&cond, {&lhs, &rhs}, {&output}, add_graph, mul_graph);
+  kernel.configure();
+  _memory_manager->allocate_memory(output);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output), FloatArrayNear({5, 14}));
+}
+
+TEST_F(IfTest, InvalidCondType_NEG)
+{
+  // Negative case: the condition tensor is FLOAT32 instead of BOOL.
+  Tensor cond = makeInputTensor<DataType::FLOAT32>({1}, {1}, _memory_manager.get());
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({2}, {5, 7}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>({1, 2}, {1, 2}, _memory_manager.get());
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  RuntimeModule module(nullptr);
+  RuntimeGraph *add_graph = buildAddSubgraph(&module, _memory_manager.get());
+  RuntimeGraph *mul_graph = buildMulSubgraph(&module, _memory_manager.get());
+
+  If kernel(&cond, {&lhs, &rhs}, {&output}, add_graph, mul_graph);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(IfTest, InvalidCondElementNum_NEG)
+{
+  // Negative case: the BOOL condition tensor holds two elements instead of one.
+  Tensor cond = makeInputTensor<DataType::BOOL>({2}, {false, true}, _memory_manager.get());
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({2}, {5, 7}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>({1, 2}, {1, 2}, _memory_manager.get());
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+  RuntimeModule module(nullptr);
+  RuntimeGraph *add_graph = buildAddSubgraph(&module, _memory_manager.get());
+  RuntimeGraph *mul_graph = buildMulSubgraph(&module, _memory_manager.get());
+
+  If kernel(&cond, {&lhs, &rhs}, {&output}, add_graph, mul_graph);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/InstanceNorm.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/InstanceNorm.cpp
new file mode 100644
index 000000000..22a329be6
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/InstanceNorm.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/InstanceNorm.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/common.h>
+#include <cmath>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Registers input, gamma and beta (in that order) as kernel inputs and `output` as the output.
+InstanceNorm::InstanceNorm(const Tensor *input, const Tensor *gamma, const Tensor *beta,
+                           Tensor *output, const InstanceNormParams &params)
+  : KernelWithParams<InstanceNormParams>({input, gamma, beta}, {output}, params)
+{}
+
+void InstanceNorm::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() == 4); // only rank-4 inputs are accepted
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(gamma()->element_type() == input()->element_type());
+  LUCI_INTERPRETER_CHECK(gamma()->shape().num_dims() == 1);
+  LUCI_INTERPRETER_CHECK(gamma()->shape().dim(0) == input()->shape().dim(3) ||
+                         gamma()->shape().dim(0) == 1); // length of input dim 3, or exactly 1
+  LUCI_INTERPRETER_CHECK(beta()->element_type() == input()->element_type());
+  LUCI_INTERPRETER_CHECK(beta()->shape().num_dims() == 1);
+  LUCI_INTERPRETER_CHECK(beta()->shape().dim(0) == input()->shape().dim(3) ||
+                         beta()->shape().dim(0) == 1); // same length rule as gamma
+  output()->resize(input()->shape()); // the output takes the input's shape
+}
+
+void InstanceNorm::execute() const
+{
+  // Dispatch on the input element type. FLOAT32 is the only type with an
+  // implementation; every other type is reported as a runtime error.
+  const bool is_float = input()->element_type() == DataType::FLOAT32;
+  if (!is_float)
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+  evalFloat();
+}
+
+void InstanceNorm::evalFloat() const
+{
+  // Normalizes each (batch, channel) slice over height x width; dims are read as (b, h, w, c):
+  //   out = (in - mean) * gamma / sqrt(var + epsilon) + beta, clamped to the activation range.
+  float activation_min, activation_max;
+  calculateActivationRange(params().activation, &activation_min, &activation_max);
+  auto input_shape = getTensorShape(input());
+  auto output_shape = getTensorShape(output());
+  const int32_t batches = tflite::MatchingDim(input_shape, 0, output_shape, 0);
+  const int32_t heights = tflite::MatchingDim(input_shape, 1, output_shape, 1);
+  const int32_t widths = tflite::MatchingDim(input_shape, 2, output_shape, 2);
+  const int32_t channels = tflite::MatchingDim(input_shape, 3, output_shape, 3);
+  const float *input_data = getTensorData<float>(input());
+  const float *gamma_data = getTensorData<float>(gamma());
+  // gamma/beta hold either one value per channel or a single value shared by all channels.
+  const bool single_gamma = gamma()->shape().num_dims() == 1 && gamma()->shape().dim(0) == 1;
+  const float *beta_data = getTensorData<float>(beta());
+  const bool single_beta = beta()->shape().num_dims() == 1 && beta()->shape().dim(0) == 1;
+  float *output_data = getTensorData<float>(output());
+  const int32_t size = heights * widths; // number of elements in one (batch, channel) slice
+  for (int32_t batch = 0; batch < batches; batch++)
+  {
+    for (int32_t channel = 0; channel < channels; channel++)
+    {
+      double sum = 0.0;
+      double square_sum = 0.0;
+      for (int32_t height = 0; height < heights; height++)
+      {
+        for (int32_t width = 0; width < widths; width++)
+        {
+          double input_val = input_data[tflite::Offset(input_shape, batch, height, width, channel)];
+          sum += input_val;
+          square_sum += (input_val * input_val);
+        }
+      }
+      const double mean = sum / size;
+      // E[x^2] - mean^2 can round slightly below zero; clamp it so sqrt() cannot produce NaN.
+      const double raw_var = square_sum / size - mean * mean;
+      const double var = raw_var < 0.0 ? 0.0 : raw_var; // a NaN variance still propagates
+      const double gamma = single_gamma ? gamma_data[0] : gamma_data[channel];
+      const double beta = single_beta ? beta_data[0] : beta_data[channel];
+      const double a = gamma / (std::sqrt(var + params().epsilon)); // out = in * a + b
+      const double b = -mean * a + beta;
+      for (int32_t height = 0; height < heights; height++)
+      {
+        for (int32_t width = 0; width < widths; width++)
+        {
+          const double input_value =
+            input_data[tflite::Offset(input_shape, batch, height, width, channel)];
+          const auto output_value = static_cast<float>(input_value * a + b);
+          output_data[tflite::Offset(output_shape, batch, height, width, channel)] =
+            tflite::ActivationFunctionWithMinMax(output_value, activation_min, activation_max);
+        }
+      }
+    }
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/InstanceNorm.h b/compiler/luci-micro/luci-interpreter/src/kernels/InstanceNorm.h
new file mode 100644
index 000000000..a70a84e0a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/InstanceNorm.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_INSTANCENORM_H
+#define LUCI_INTERPRETER_KERNELS_INSTANCENORM_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class InstanceNorm : public KernelWithParams<InstanceNormParams>
+{
+public:
+  InstanceNorm(const Tensor *input, const Tensor *gamma, const Tensor *beta, Tensor *output,
+               const InstanceNormParams &params);
+  // Kernel interface.
+  void configure() override;
+  void execute() const override;
+  // Tensor accessors by slot: inputs 0, 1, 2 are input, gamma, beta; output 0 is the result.
+  Tensor *output() const { return _outputs[0]; }
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *gamma() const { return _inputs[1]; }
+  const Tensor *beta() const { return _inputs[2]; }
+
+private:
+  void evalFloat() const; // FLOAT32 implementation
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_INSTANCENORM_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/InstanceNorm.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/InstanceNorm.test.cpp
new file mode 100644
index 000000000..04400c3c0
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/InstanceNorm.test.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "kernels/InstanceNorm.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class InstanceNormTest : public ::testing::Test
+{
+protected:
+  // Memory manager handed to the tensor helpers; SetUp() assigns a new TestMemoryManager.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+};
+
+TEST_F(InstanceNormTest, Simple)
+{
+  // Every input value is 1 and beta is 2; the expected output is 2 everywhere.
+  Tensor input =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 2, 1}, {1, 1, 1, 1}, _memory_manager.get());
+  Tensor gamma = makeInputTensor<DataType::FLOAT32>({1}, {1}, _memory_manager.get());
+  Tensor beta = makeInputTensor<DataType::FLOAT32>({1}, {2}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  InstanceNormParams norm_params{};
+  norm_params.epsilon = 0.1f;
+  norm_params.activation = Activation::NONE;
+
+  InstanceNorm kernel(&input, &gamma, &beta, &output_tensor, norm_params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear({2, 2, 2, 2}));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2, 1}));
+}
+
+TEST_F(InstanceNormTest, Single_gamma_beta)
+{
+  // One-element gamma and beta are used with an input whose last dimension is 2.
+  Tensor input =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 1, 2}, {1, 1, 1, 1}, _memory_manager.get());
+  Tensor gamma = makeInputTensor<DataType::FLOAT32>({1}, {1}, _memory_manager.get());
+  Tensor beta = makeInputTensor<DataType::FLOAT32>({1}, {2}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  InstanceNormParams norm_params{};
+  norm_params.epsilon = 0.1f;
+  norm_params.activation = Activation::NONE;
+
+  InstanceNorm kernel(&input, &gamma, &beta, &output_tensor, norm_params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear({2, 2, 2, 2}));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 1, 2}));
+}
+
+TEST_F(InstanceNormTest, Wrong_gamma_beta_dim_NEG)
+{
+  // Negative case: gamma and beta have 3 elements while the input's last dimension is 2.
+  Tensor input =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 1, 2}, {1, 1, 1, 1}, _memory_manager.get());
+  Tensor gamma = makeInputTensor<DataType::FLOAT32>({3}, {1, 1, 1}, _memory_manager.get());
+  Tensor beta = makeInputTensor<DataType::FLOAT32>({3}, {2, 2, 2}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::FLOAT32);
+  InstanceNormParams norm_params{};
+  norm_params.epsilon = 0.1f;
+  norm_params.activation = Activation::NONE;
+
+  InstanceNorm kernel(&input, &gamma, &beta, &result, norm_params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/L2Normalize.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/L2Normalize.cpp
new file mode 100644
index 000000000..64222953f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/L2Normalize.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/L2Normalize.h"
+#include "kernels/Utils.h"
+
+#include "PALL2Normalize.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+// Forwards the single input, the single output and the params to KernelWithParams.
+L2Normalize::L2Normalize(const Tensor *input, Tensor *output, const L2NormParams &params)
+  : KernelWithParams<L2NormParams>({input}, {output}, params)
+{}
+
+void L2Normalize::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() <= 4); // at most rank 4
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32 ||
+                         output()->element_type() == DataType::U8);
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  if (output()->element_type() == DataType::U8)
+  {
+    LUCI_INTERPRETER_CHECK(output()->scale() == (1. / 128.)); // U8 output quantization is fixed:
+    LUCI_INTERPRETER_CHECK(output()->zero_point() == 128);    // scale 1/128, zero point 128
+  }
+  LUCI_INTERPRETER_CHECK(params().activation == Activation::NONE); // no fused activation allowed
+  output()->resize(input()->shape()); // the output takes the input's shape
+}
+
+void L2Normalize::execute() const
+{
+  // Pick the element-typed evaluator; only FLOAT32 and U8 outputs are implemented.
+  const auto out_type = output()->element_type();
+  if (out_type == DataType::FLOAT32)
+  {
+    eval<float>(0);
+    return;
+  }
+  // U8 forwards the input's zero point; any other type is a runtime error.
+  if (out_type != DataType::U8)
+    throw std::runtime_error("Unsupported type.");
+  eval<uint8_t>(input()->zero_point());
+}
+
+template <typename T> void L2Normalize::eval(int32_t zero_point) const
+{
+  tflite::L2NormalizationParams pal_params{}; // value-initialized; only the zero point is set
+  pal_params.input_zero_point = zero_point;
+  const auto in_shape = getTensorShape(input());
+  luci_interpreter_pal::L2Normalization(pal_params, in_shape, getTensorData<T>(input()),
+                                        getTensorShape(output()), getTensorData<T>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/L2Normalize.h b/compiler/luci-micro/luci-interpreter/src/kernels/L2Normalize.h
new file mode 100644
index 000000000..6c7dac698
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/L2Normalize.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_L2NORMALIZE_H
+#define LUCI_INTERPRETER_KERNELS_L2NORMALIZE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class L2Normalize : public KernelWithParams<L2NormParams>
+{
+public:
+  L2Normalize(const Tensor *input, Tensor *output, const L2NormParams &params);
+  // Kernel interface.
+  void configure() override;
+  void execute() const override;
+  // Tensor accessors: slot 0 of the outputs and slot 0 of the inputs.
+  Tensor *output() const { return _outputs[0]; }
+  const Tensor *input() const { return _inputs[0]; }
+
+private:
+  template <typename T> void eval(int32_t zero_point) const; // element-typed worker
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_L2NORMALIZE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/L2Normalize.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/L2Normalize.test.cpp
new file mode 100644
index 000000000..6f960e8b4
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/L2Normalize.test.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "kernels/L2Normalize.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<float> input_data, std::initializer_list<float> output_data)
+{
+  // Generic path: the tensors are FLOAT32 regardless of T.
+  std::unique_ptr<IMemoryManager> mm = std::make_unique<TestMemoryManager>();
+  Tensor input = makeInputTensor<DataType::FLOAT32>(input_shape, input_data, mm.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  L2NormParams norm_params{};
+  norm_params.activation = Activation::NONE;
+
+  L2Normalize kernel(&input, &output_tensor, norm_params);
+  kernel.configure();
+  mm->allocate_memory(output_tensor);
+  kernel.execute();
+  // Both the values and the resulting shape must match the expectation.
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+template <>
+void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
+                    std::initializer_list<int32_t> output_shape,
+                    std::initializer_list<float> input_data,
+                    std::initializer_list<float> output_data)
+{
+  std::unique_ptr<IMemoryManager> mm = std::make_unique<TestMemoryManager>();
+  // The range passed to quantizationParams spans the data and always includes 0.
+  const float data_min = std::min(input_data);
+  const float data_max = std::max(input_data);
+  std::pair<float, int32_t> input_quant = quantizationParams<uint8_t>(
+    data_min < 0 ? data_min : 0.f, data_max > 0 ? data_max : 0.f);
+  Tensor input = makeInputTensor<DataType::U8>(input_shape, input_quant.first,
+                                               input_quant.second, input_data, mm.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 128., 128);
+
+  L2NormParams norm_params{};
+  norm_params.activation = Activation::NONE;
+  L2Normalize kernel(&input, &output_tensor, norm_params);
+  kernel.configure();
+  mm->allocate_memory(output_tensor);
+  kernel.execute();
+  // Dequantized values are compared; the output scale is FloatArrayNear's second argument.
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(output_data, output_tensor.scale()));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+template <typename T> class L2NormalizeTest : public ::testing::Test
+{
+};
+// The typed suite registered below is instantiated for each type listed in DataTypes.
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_SUITE(L2NormalizeTest, DataTypes);
+
+TYPED_TEST(L2NormalizeTest, Simple)
+{
+  Check<TypeParam>({1, 1, 1, 6}, {1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1},
+                   {-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}); // = input / 2 (input L2 norm is 2)
+}
+
+TEST(L2NormalizeTest, ActivationType_NEG)
+{
+  // Negative case: a fused RELU6 activation is requested and configure() is expected to throw.
+  std::unique_ptr<IMemoryManager> mm = std::make_unique<TestMemoryManager>();
+  std::vector<float> values = {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
+
+  Tensor input = makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, values, mm.get());
+  Tensor result = makeOutputTensor(DataType::FLOAT32);
+
+  L2NormParams norm_params{};
+  norm_params.activation = Activation::RELU6;
+
+  L2Normalize kernel(&input, &result, norm_params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(L2NormalizeTest, InvalidOutputQuantParam_NEG)
+{
+  // Negative case: the U8 output uses scale 1/64 and zero point 127; configure() should throw.
+  std::unique_ptr<IMemoryManager> mm = std::make_unique<TestMemoryManager>();
+  std::vector<float> values = {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
+
+  Tensor input = makeInputTensor<DataType::U8>({1, 1, 1, 6}, 1. / 64., 127, values, mm.get());
+  Tensor result = makeOutputTensor(DataType::U8, 1. / 64., 127);
+
+  L2NormParams norm_params{};
+  norm_params.activation = Activation::NONE;
+
+  L2Normalize kernel(&input, &result, norm_params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/L2Pool2D.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/L2Pool2D.cpp
new file mode 100644
index 000000000..5a88808d5
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/L2Pool2D.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/L2Pool2D.h"
+
+#include "kernels/Utils.h"
+
+#include "PALL2Pool2D.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+// Hands the single input tensor, the single output tensor and the pooling parameters to the
+// KernelWithParams base; no further set-up happens here.
+L2Pool2D::L2Pool2D(const Tensor *input, Tensor *output, const Pool2DParams &params)
+  : KernelWithParams<Pool2DParams>({input}, {output}, params)
+{
+  // Intentionally empty.
+}
+
+// Validates the tensors, derives the pooled output size and the paddings used by execute(),
+// and resizes the output tensor accordingly.
+void L2Pool2D::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() == 4);
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+
+  // The four input dimensions; dims 0 and 3 are copied unchanged into the output shape.
+  const auto &in_shape = input()->shape();
+  const int batches = in_shape.dim(0);
+  const int in_height = in_shape.dim(1);
+  const int in_width = in_shape.dim(2);
+  const int channels_out = in_shape.dim(3);
+
+  // Matching GetWindowedOutputSize in TensorFlow.
+  const auto &pool = params();
+  const int out_height =
+    computeOutputSize(pool.padding, in_height, pool.filter_height, pool.stride_height, 1);
+  const int out_width =
+    computeOutputSize(pool.padding, in_width, pool.filter_width, pool.stride_width, 1);
+
+  // Remember the paddings; execute() forwards them to the pooling routine.
+  _padding_height =
+    computePadding(pool.stride_height, 1, in_height, pool.filter_height, out_height);
+  _padding_width = computePadding(pool.stride_width, 1, in_width, pool.filter_width, out_width);
+
+  LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::FLOAT32);
+  output()->resize({batches, out_height, out_width, channels_out});
+}
+
+// Runs L2 pooling on the input tensor and writes the result into the output tensor.
+// Only FLOAT32 is handled; any other element type raises std::runtime_error.
+void L2Pool2D::execute() const
+{
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+    {
+      // The braces give the locals their own scope so that no declaration is visible from
+      // (or jumped over by) the `default` label below.
+      float activation_min{};
+      float activation_max{};
+      calculateActivationRange(params().activation, &activation_min, &activation_max);
+
+      // Value-initialise so that fields not assigned here are zero instead of indeterminate.
+      tflite::PoolParams op_params{};
+      op_params.stride_height = params().stride_height;
+      op_params.stride_width = params().stride_width;
+      op_params.filter_height = params().filter_height;
+      op_params.filter_width = params().filter_width;
+      // Paddings were computed by configure().
+      op_params.padding_values.height = _padding_height;
+      op_params.padding_values.width = _padding_width;
+      op_params.float_activation_min = activation_min;
+      op_params.float_activation_max = activation_max;
+      luci_interpreter_pal::L2Pool(op_params, getTensorShape(input()),
+                                   getTensorData<float>(input()), getTensorShape(output()),
+                                   getTensorData<float>(output()));
+      break;
+    }
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/L2Pool2D.h b/compiler/luci-micro/luci-interpreter/src/kernels/L2Pool2D.h
new file mode 100644
index 000000000..d40f5f478
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/L2Pool2D.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_L2POOL2D_H
+#define LUCI_INTERPRETER_KERNELS_L2POOL2D_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+#include <memory>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// L2 pooling kernel: one input tensor, one output tensor, attributes given by Pool2DParams.
+class L2Pool2D : public KernelWithParams<Pool2DParams>
+{
+public:
+  L2Pool2D(const Tensor *input, Tensor *output, const Pool2DParams &params);
+
+  // First (and only) input tensor registered with the kernel.
+  const Tensor *input() const { return _inputs[0]; }
+  // First (and only) output tensor registered with the kernel.
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  // Paddings stored by configure() and read by execute(); zero until configured.
+  int32_t _padding_height{0};
+  int32_t _padding_width{0};
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_L2POOL2D_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/L2Pool2D.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/L2Pool2D.test.cpp
new file mode 100644
index 000000000..7245456cb
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/L2Pool2D.test.cpp
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/L2Pool2D.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Fixture that gives every L2Pool2D test case its own TestMemoryManager.
+class L2Pool2DTest : public ::testing::Test
+{
+protected:
+  // Owns the backing memory of the tensors created by a test case.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+
+  void SetUp() override
+  {
+    _memory_manager = std::make_unique<TestMemoryManager>();
+  }
+};
+
+// VALID padding, 2x2 window, stride 2, no activation: every output value is the root mean
+// square of one non-overlapping window of the 2x4 input.
+TEST_F(L2Pool2DTest, FloatNone)
+{
+  Shape input_shape{1, 2, 4, 1};
+  std::vector<float> input_data{
+    0, 6, 2,  4, //
+    3, 2, 10, 7, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.activation = Activation::NONE;
+  params.filter_height = 2;
+  params.filter_width = 2;
+  params.stride_height = 2;
+  params.stride_width = 2;
+
+  L2Pool2D kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{3.5, 6.5};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  // One window fits vertically and two horizontally.
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 1}));
+}
+
+// Same pooling geometry as FloatNone but with a RELU activation and negative inputs in the
+// first window; the expected values are non-negative, so they pass through RELU unchanged.
+TEST_F(L2Pool2DTest, FloatRelu)
+{
+  Shape input_shape{1, 2, 4, 1};
+  std::vector<float> input_data{
+    -1, -6, 2,  4, //
+    -3, -2, 10, 7, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.activation = Activation::RELU;
+  params.filter_height = 2;
+  params.filter_width = 2;
+  params.stride_height = 2;
+  params.stride_width = 2;
+
+  L2Pool2D kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{3.53553, 6.5};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  // One window fits vertically and two horizontally.
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 1}));
+}
+
+// RELU_N1_TO_1 activation: the second window pools to 6.5 and is expected to be clamped to 1.
+TEST_F(L2Pool2DTest, FloatRelu1)
+{
+  Shape input_shape{1, 2, 4, 1};
+  std::vector<float> input_data{
+    -0.1, -0.6, 2,  4, //
+    -0.3, -0.2, 10, 7, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.activation = Activation::RELU_N1_TO_1;
+  params.filter_height = 2;
+  params.filter_width = 2;
+  params.stride_height = 2;
+  params.stride_width = 2;
+
+  L2Pool2D kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{0.353553, 1.0};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  // One window fits vertically and two horizontally.
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 1}));
+}
+
+// RELU6 activation: the second window pools to 6.5 and is expected to be clamped to 6.
+TEST_F(L2Pool2DTest, FloatRelu6)
+{
+  Shape input_shape{1, 2, 4, 1};
+  std::vector<float> input_data{
+    -0.1, -0.6, 2,  4, //
+    -0.3, -0.2, 10, 7, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.activation = Activation::RELU6;
+  params.filter_height = 2;
+  params.filter_width = 2;
+  params.stride_height = 2;
+  params.stride_width = 2;
+
+  L2Pool2D kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{0.353553, 6.0};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  // One window fits vertically and two horizontally.
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 1}));
+}
+
+// SAME padding with a 2x2 window and stride 2: the input divides evenly into windows, so the
+// expected values equal those of the VALID case.
+TEST_F(L2Pool2DTest, FloatPaddingSame)
+{
+  Shape input_shape{1, 2, 4, 1};
+  std::vector<float> input_data{
+    0, 6, 2,  4, //
+    3, 2, 10, 7, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Pool2DParams params{};
+  params.padding = Padding::SAME;
+  params.activation = Activation::NONE;
+  params.filter_height = 2;
+  params.filter_width = 2;
+  params.stride_height = 2;
+  params.stride_width = 2;
+
+  L2Pool2D kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{3.5, 6.5};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  // With stride 2 the 2x4 spatial extent shrinks to 1x2.
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 1}));
+}
+
+// SAME padding with a 2x2 window and stride 1: one output value per input position.
+TEST_F(L2Pool2DTest, FloatPaddingSameStride)
+{
+  Shape input_shape{1, 2, 4, 1};
+  std::vector<float> input_data{
+    0, 6, 2,  4, //
+    3, 2, 10, 7, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Pool2DParams params{};
+  params.padding = Padding::SAME;
+  params.activation = Activation::NONE;
+  params.filter_height = 2;
+  params.filter_width = 2;
+  params.stride_height = 1;
+  params.stride_width = 1;
+
+  L2Pool2D kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{3.5, 6.0, 6.5, 5.70088, 2.54951, 7.2111, 8.63134, 7.0};
+  // NOTE with NEON+ruy, error is #1=-1.14441e-05, #6=-1.81198e-05
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data, 1.0e-4f));
+  // With SAME padding and stride 1 the output keeps the input's spatial extent.
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
+}
+
+// VALID padding with a 2x2 window and stride 1: overlapping windows, three per row.
+TEST_F(L2Pool2DTest, FloatPaddingValidStride)
+{
+  Shape input_shape{1, 2, 4, 1};
+  std::vector<float> input_data{
+    0, 6, 2,  4, //
+    3, 2, 10, 7, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.activation = Activation::NONE;
+  params.filter_height = 2;
+  params.filter_width = 2;
+  params.stride_height = 1;
+  params.stride_width = 1;
+
+  L2Pool2D kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{3.5, 6.0, 6.5};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  // A 2x2 window slides over 2x4 without padding: one row of three positions.
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 3, 1}));
+}
+
+// Negative case: a rank-3 input must be rejected by configure().
+TEST_F(L2Pool2DTest, InvalidInputShape_NEG)
+{
+  const Shape rank3_shape{1, 2, 4};
+  const std::vector<float> values{
+    0, 6, 2,  4, //
+    3, 2, 10, 7, //
+  };
+
+  Pool2DParams pool{};
+  pool.padding = Padding::VALID;
+  pool.activation = Activation::NONE;
+  pool.filter_height = 2;
+  pool.filter_width = 2;
+  pool.stride_height = 1;
+  pool.stride_width = 1;
+
+  Tensor in = makeInputTensor<DataType::FLOAT32>(rank3_shape, values, _memory_manager.get());
+  Tensor out = makeOutputTensor(DataType::FLOAT32);
+
+  L2Pool2D l2_pool(&in, &out, pool);
+  EXPECT_ANY_THROW(l2_pool.configure());
+}
+
+// Negative case: a FLOAT32 input paired with a U8 output must be rejected by configure().
+TEST_F(L2Pool2DTest, InvalidInputOutputType_NEG)
+{
+  // The shape is deliberately a valid rank-4 one, so the rank check passes and the
+  // exception can only come from the input/output element-type comparison.
+  Shape input_shape{1, 2, 4, 1};
+  std::vector<float> input_data{
+    0, 6, 2,  4, //
+    3, 2, 10, 7, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.activation = Activation::NONE;
+  params.filter_height = 2;
+  params.filter_width = 2;
+  params.stride_height = 1;
+  params.stride_width = 1;
+
+  L2Pool2D kernel(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LeakyRelu.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/LeakyRelu.cpp
new file mode 100644
index 000000000..3833a55e8
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LeakyRelu.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LeakyRelu.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/leaky_relu.h>
+
+#include "PALLeakyRelu.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+// Hands the single input tensor, the single output tensor and the parameters to the
+// KernelWithParams base; no further set-up happens here.
+LeakyRelu::LeakyRelu(const Tensor *input, Tensor *output, const LeakyReluParams &params)
+  : KernelWithParams<LeakyReluParams>({input}, {output}, params)
+{
+  // Intentionally empty.
+}
+
+// Checks that input and output share an element type, precomputes the fixed-point
+// multipliers needed for U8 tensors, and gives the output the input's shape.
+void LeakyRelu::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+
+  const bool is_quantized = (input()->element_type() == DataType::U8);
+  if (is_quantized)
+  {
+    const auto in_scale = input()->scale();
+    const auto out_scale = output()->scale();
+
+    // Rescaling factor combined with alpha.
+    const double alpha_factor = in_scale * params().alpha / out_scale;
+    quantizeMultiplier(alpha_factor, &_output_multiplier_alpha, &_output_shift_alpha);
+
+    // Plain input-to-output rescaling factor.
+    const double identity_factor = in_scale / out_scale;
+    quantizeMultiplier(identity_factor, &_output_multiplier_identity, &_output_shift_identity);
+  }
+
+  output()->resize(input()->shape());
+}
+
+// Dispatches to the float or the quantized implementation based on the input element type;
+// any other type raises std::runtime_error.
+void LeakyRelu::execute() const
+{
+  const auto type = input()->element_type();
+  if (type == DataType::FLOAT32)
+  {
+    evalFloat();
+  }
+  else if (type == DataType::U8)
+  {
+    evalQuantized();
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+}
+
+// Float path: forwards alpha and the tensor buffers to the PAL LeakyRelu routine.
+void LeakyRelu::evalFloat() const
+{
+  tflite::LeakyReluParams float_params{};
+  float_params.alpha = params().alpha;
+
+  const auto *in_data = getTensorData<float>(input());
+  auto *out_data = getTensorData<float>(output());
+  luci_interpreter_pal::LeakyRelu(float_params, getTensorShape(input()), in_data,
+                                  getTensorShape(output()), out_data);
+}
+
+// U8 path: passes the zero points and the multiplier/shift pairs prepared by configure()
+// to the TFLite reference implementation.
+void LeakyRelu::evalQuantized() const
+{
+  tflite::LeakyReluParams quant_params{};
+  // Zero points of the two tensors.
+  quant_params.input_offset = input()->zero_point();
+  quant_params.output_offset = output()->zero_point();
+  // Multiplier/shift pairs computed in configure().
+  quant_params.output_multiplier_identity = _output_multiplier_identity;
+  quant_params.output_shift_identity = _output_shift_identity;
+  quant_params.output_multiplier_alpha = _output_multiplier_alpha;
+  quant_params.output_shift_alpha = _output_shift_alpha;
+
+  const auto *in_data = getTensorData<uint8_t>(input());
+  auto *out_data = getTensorData<uint8_t>(output());
+  tflite::reference_ops::QuantizeLeakyRelu(quant_params, getTensorShape(input()), in_data,
+                                           getTensorShape(output()), out_data);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LeakyRelu.h b/compiler/luci-micro/luci-interpreter/src/kernels/LeakyRelu.h
new file mode 100644
index 000000000..e66f404df
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LeakyRelu.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_LEAKYRELU_H
+#define LUCI_INTERPRETER_KERNELS_LEAKYRELU_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// LeakyRelu kernel: one input tensor, one output tensor, attributes given by LeakyReluParams.
+class LeakyRelu : public KernelWithParams<LeakyReluParams>
+{
+public:
+  LeakyRelu(const Tensor *input, Tensor *output, const LeakyReluParams &params);
+
+  // First (and only) input tensor registered with the kernel.
+  const Tensor *input() const { return _inputs[0]; }
+  // First (and only) output tensor registered with the kernel.
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  // Per-type implementations selected by execute().
+  void evalFloat() const;
+  void evalQuantized() const;
+
+  // Multiplier/shift pairs filled in by configure() for U8 tensors; zero otherwise.
+  int32_t _output_multiplier_alpha{0};
+  int _output_shift_alpha{0};
+  int32_t _output_multiplier_identity{0};
+  int _output_shift_identity{0};
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_LEAKYRELU_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LeakyRelu.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/LeakyRelu.test.cpp
new file mode 100644
index 000000000..0f6263b57
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LeakyRelu.test.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LeakyRelu.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Generic test driver: builds tensors of element type T, runs LeakyRelu with the given alpha
+// and compares the resulting shape and values element-wise with the expected ones.
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<float> input_data, std::initializer_list<float> output_data,
+           float alpha)
+{
+  constexpr DataType element_type = getElementType<T>();
+  std::unique_ptr<IMemoryManager> manager{std::make_unique<TestMemoryManager>()};
+
+  Tensor in = makeInputTensor<element_type>(input_shape, input_data, manager.get());
+  Tensor out = makeOutputTensor(element_type);
+
+  LeakyReluParams relu_params{};
+  relu_params.alpha = alpha;
+
+  LeakyRelu leaky_relu(&in, &out, relu_params);
+
+  // The output buffer can only be allocated once configure() has sized the tensor.
+  leaky_relu.configure();
+  manager->allocate_memory(out);
+  leaky_relu.execute();
+
+  EXPECT_THAT(extractTensorShape(out), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(extractTensorData<T>(out), ::testing::ElementsAreArray(output_data));
+}
+
+// U8 specialisation of the test driver: input and output share quantization parameters
+// derived from the range [-8, 127/16], and results are compared after dequantization
+// within a tolerance.
+template <>
+void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
+                    std::initializer_list<int32_t> output_shape,
+                    std::initializer_list<float> input_data,
+                    std::initializer_list<float> output_data, float alpha)
+{
+  std::unique_ptr<IMemoryManager> manager{std::make_unique<TestMemoryManager>()};
+
+  const float tolerance = getTolerance(-8, 127.f / 16.f, 255);
+  const std::pair<float, int32_t> quant = quantizationParams<uint8_t>(-8, 127.f / 16.f);
+  const float scale = quant.first;
+  const int32_t zero_point = quant.second;
+
+  Tensor in =
+    makeInputTensor<DataType::U8>(input_shape, scale, zero_point, input_data, manager.get());
+  Tensor out = makeOutputTensor(DataType::U8, scale, zero_point);
+
+  LeakyReluParams relu_params{};
+  relu_params.alpha = alpha;
+
+  LeakyRelu leaky_relu(&in, &out, relu_params);
+
+  // The output buffer can only be allocated once configure() has sized the tensor.
+  leaky_relu.configure();
+  manager->allocate_memory(out);
+  leaky_relu.execute();
+
+  EXPECT_THAT(extractTensorShape(out), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(dequantizeTensorData(out), FloatArrayNear(output_data, tolerance));
+}
+
+// Stateless fixture for the typed LeakyRelu tests.
+template <class T> class LeakReluTest : public ::testing::Test
+{
+};
+
+// Element types the typed suite below is instantiated with.
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_SUITE(LeakReluTest, DataTypes);
+
+// With alpha = 0.5 non-negative values are expected unchanged and negative values halved.
+TYPED_TEST(LeakReluTest, Simple)
+{
+  Check<TypeParam>(
+    /*input_shape=*/{2, 3},
+    /*output_shape=*/{2, 3},
+    /*input_data=*/
+    {
+      0.0f, 1.0f, 3.0f,   // first row
+      1.0f, -1.0f, -2.0f, // second row
+    },
+    /*output_data=*/
+    {
+      0.0f, 1.0f, 3.0f,   // first row
+      1.0f, -0.5f, -1.0f, // second row
+    },
+    /*alpha=*/0.5f);
+
+  SUCCEED();
+}
+
+// Negative case: a FLOAT32 input paired with a U8 output must be rejected by configure().
+TEST(LeakReluTest, IvalidInputOutputType_NEG)
+{
+  std::unique_ptr<IMemoryManager> manager{std::make_unique<TestMemoryManager>()};
+  const std::vector<float> values{
+    0.0f, 1.0f, 3.0f,   // first row
+    1.0f, -1.0f, -2.0f, // second row
+  };
+
+  Tensor in = makeInputTensor<DataType::FLOAT32>({2, 3}, values, manager.get());
+  Tensor out = makeOutputTensor(DataType::U8);
+
+  LeakyReluParams relu_params{};
+  relu_params.alpha = 0.5f;
+
+  LeakyRelu leaky_relu(&in, &out, relu_params);
+
+  EXPECT_ANY_THROW(leaky_relu.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Less.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Less.cpp
new file mode 100644
index 000000000..8d26ff297
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Less.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Less.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/comparisons.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+// Registers x and y as the two inputs and `output` as the single output of the kernel.
+Less::Less(const Tensor *x, const Tensor *y, Tensor *output) : Kernel({x, y}, {output})
+{
+  // Intentionally empty.
+}
+
+// Requires both inputs to share an element type and the output to be BOOL, prepares the
+// fixed-point multipliers for U8 inputs, and sizes the output to the broadcast shape.
+void Less::configure()
+{
+  LUCI_INTERPRETER_CHECK(x()->element_type() == y()->element_type());
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::BOOL);
+
+  const bool is_quantized = (x()->element_type() == DataType::U8);
+  if (is_quantized)
+  {
+    // One multiplier/shift pair per operand, derived from that operand's scale.
+    quantizeMultiplierSmallerThanOneExp(x()->scale(), &_x_multiplier, &_x_shift);
+    quantizeMultiplierSmallerThanOneExp(y()->scale(), &_y_multiplier, &_y_shift);
+  }
+
+  const auto broadcast_shape = calculateShapeForBroadcast(x()->shape(), y()->shape());
+  output()->resize(broadcast_shape);
+}
+
+// Selects the comparison routine matching the element type of x; unsupported types raise
+// std::runtime_error.
+void Less::execute() const
+{
+  const auto type = x()->element_type();
+  if (type == DataType::FLOAT32)
+  {
+    evalFloat();
+  }
+  else if (type == DataType::S64)
+  {
+    evalInteger<int64_t>();
+  }
+  else if (type == DataType::S32)
+  {
+    evalInteger<int32_t>();
+  }
+  else if (type == DataType::U8)
+  {
+    evalQuantized();
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+}
+
+// Float comparison: uses the broadcasting reference routine when the operand shapes differ
+// and the plain element-wise one otherwise.
+void Less::evalFloat() const
+{
+  const auto lhs = getTensorData<float>(x());
+  const auto rhs = getTensorData<float>(y());
+  auto result = getTensorData<bool>(output());
+
+  tflite::ComparisonParams cmp_params;
+  cmp_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (!cmp_params.is_broadcast)
+  {
+    tflite::reference_ops::Less(cmp_params, getTensorShape(x()), lhs, getTensorShape(y()), rhs,
+                                getTensorShape(output()), result);
+    return;
+  }
+
+  tflite::reference_ops::Broadcast4DSlowLess(cmp_params, getTensorShape(x()), lhs,
+                                             getTensorShape(y()), rhs, getTensorShape(output()),
+                                             result);
+}
+
+// Integer comparison for element type T: uses the broadcasting "NoScaling" reference routine
+// when the operand shapes differ and the plain element-wise one otherwise.
+template <typename T> void Less::evalInteger() const
+{
+  const auto lhs = getTensorData<T>(x());
+  const auto rhs = getTensorData<T>(y());
+  auto result = getTensorData<bool>(output());
+
+  tflite::ComparisonParams cmp_params;
+  cmp_params.is_broadcast = x()->shape() != y()->shape();
+
+  if (!cmp_params.is_broadcast)
+  {
+    tflite::reference_ops::LessNoScaling(cmp_params, getTensorShape(x()), lhs,
+                                         getTensorShape(y()), rhs, getTensorShape(output()),
+                                         result);
+    return;
+  }
+
+  tflite::reference_ops::Broadcast4DSlowLessNoScaling(cmp_params, getTensorShape(x()), lhs,
+                                                      getTensorShape(y()), rhs,
+                                                      getTensorShape(output()), result);
+}
+
+// U8 comparison: passes the negated zero points and the multiplier/shift pairs prepared by
+// configure() to the "WithScaling" reference routines, broadcasting when the shapes differ.
+void Less::evalQuantized() const
+{
+  tflite::ComparisonParams cmp_params;
+  cmp_params.left_shift = 8;
+  // Left operand (x).
+  cmp_params.input1_offset = -x()->zero_point(); // Note the '-'
+  cmp_params.input1_multiplier = _x_multiplier;
+  cmp_params.input1_shift = _x_shift;
+  // Right operand (y).
+  cmp_params.input2_offset = -y()->zero_point(); // Note the '-'
+  cmp_params.input2_multiplier = _y_multiplier;
+  cmp_params.input2_shift = _y_shift;
+  cmp_params.is_broadcast = x()->shape() != y()->shape();
+
+  const auto lhs = getTensorData<uint8_t>(x());
+  const auto rhs = getTensorData<uint8_t>(y());
+  auto result = getTensorData<bool>(output());
+
+  if (!cmp_params.is_broadcast)
+  {
+    tflite::reference_ops::LessWithScaling(cmp_params, getTensorShape(x()), lhs,
+                                           getTensorShape(y()), rhs, getTensorShape(output()),
+                                           result);
+    return;
+  }
+
+  tflite::reference_ops::Broadcast4DSlowLessWithScaling(cmp_params, getTensorShape(x()), lhs,
+                                                        getTensorShape(y()), rhs,
+                                                        getTensorShape(output()), result);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Less.h b/compiler/luci-micro/luci-interpreter/src/kernels/Less.h
new file mode 100644
index 000000000..e27bb689c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Less.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_LESS_H
+#define LUCI_INTERPRETER_KERNELS_LESS_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Element-wise "x < y" kernel with two input tensors and one output tensor.
+class Less : public Kernel
+{
+public:
+  Less(const Tensor *x, const Tensor *y, Tensor *output);
+
+  // Left operand: first registered input.
+  const Tensor *x() const { return _inputs[0]; }
+  // Right operand: second registered input.
+  const Tensor *y() const { return _inputs[1]; }
+  // The only registered output.
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  // Per-type implementations selected by execute().
+  void evalFloat() const;
+  template <typename T> void evalInteger() const;
+  void evalQuantized() const;
+
+  // Multiplier/shift pairs filled in by configure() for U8 inputs; zero otherwise.
+  int32_t _x_multiplier{0};
+  int _x_shift{0};
+  int32_t _y_multiplier{0};
+  int _y_shift{0};
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_LESS_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Less.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Less.test.cpp
new file mode 100644
index 000000000..8c5963363
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Less.test.cpp
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Less.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+// Fixture for the Less kernel tests: SetUp() installs a new TestMemoryManager.
+class LessTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Passed to makeInputTensor() and used to allocate the output tensor in each test.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(LessTest, FloatSimple)
+{
+  std::vector<float> lhs_values{
+    0.5, 0.7, 0.9, // Row 1
+    1,   0,   -1,  // Row 2
+  };
+  // Same shape as the left operand, so no broadcasting is involved.
+  std::vector<float> rhs_values{
+    0.9, 0.7, 0.5, // Row 1
+    -1,  0,   1,   // Row 2
+  };
+  // expected[i] == (lhs_values[i] < rhs_values[i])
+  std::vector<bool> expected{
+    true,  false, false, // Row 1
+    false, false, true,  // Row 2
+  };
+
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({2, 3}, lhs_values, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>({2, 3}, rhs_values, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  Less less_op(&lhs, &rhs, &result);
+  less_op.configure();
+  _memory_manager->allocate_memory(result);
+  less_op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({2, 3}));
+}
+
+TEST_F(LessTest, FloatBroardcast)
+{
+  std::vector<float> lhs_values{
+    0.5, 0.7, 0.9, // Row 1
+    1,   0,   -1,  // Row 2
+    -1,  0,   1,   // Row 3
+  };
+  // A single row that is compared against every row of the left operand.
+  std::vector<float> rhs_values{
+    0.9, 0.7, 0.5, // Row 1
+  };
+  // expected[r][c] == (lhs_values[r][c] < rhs_values[c])
+  std::vector<bool> expected{
+    true,  false, false, // Row 1
+    false, true,  true,  // Row 2
+    true,  true,  false, // Row 3
+  };
+
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({3, 3}, lhs_values, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>({1, 3}, rhs_values, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  Less less_op(&lhs, &rhs, &result);
+  less_op.configure();
+  _memory_manager->allocate_memory(result);
+  less_op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({3, 3}));
+}
+
+template <loco::DataType DType>
+void checkIntegerSimple(luci_interpreter::IMemoryManager *memory_manager)
+{
+  using value_t = typename loco::DataTypeImpl<DType>::Type;
+  value_t lowest = std::numeric_limits<value_t>::min();
+  value_t highest = std::numeric_limits<value_t>::max();
+  std::vector<value_t> lhs_values{lowest, 2, highest};
+  // Covers both ends of the value range plus one pair of small operands.
+  std::vector<value_t> rhs_values{lowest + 1, -2, highest};
+  // min < min + 1, !(2 < -2), !(max < max)
+  std::vector<bool> expected{true, false, false};
+
+  Tensor lhs = makeInputTensor<DType>({3}, lhs_values, memory_manager);
+  Tensor rhs = makeInputTensor<DType>({3}, rhs_values, memory_manager);
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  Less less_op(&lhs, &rhs, &result);
+  less_op.configure();
+  memory_manager->allocate_memory(result);
+  less_op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({3}));
+}
+
+template <loco::DataType DType>
+void checkIntegerBroadcast(luci_interpreter::IMemoryManager *memory_manager)
+{
+  using value_t = typename loco::DataTypeImpl<DType>::Type;
+  value_t lowest = std::numeric_limits<value_t>::min();
+  value_t highest = std::numeric_limits<value_t>::max();
+  std::vector<value_t> lhs_values{
+    lowest, 2,  3,       // Row 1
+    4,      5,  highest, // Row 2
+    -1,     -4, -3,      // Row 3
+    lowest, -2, highest, // Row 4
+  };
+  // One row, compared against each of the four rows above.
+  std::vector<value_t> rhs_values{
+    lowest + 1, -2, highest - 1, // Row 1
+  };
+  // expected[r][c] == (lhs_values[r][c] < rhs_values[c])
+  std::vector<bool> expected{
+    true,  false, true,  // Row 1
+    false, false, false, // Row 2
+    false, true,  true,  // Row 3
+    true,  false, false, // Row 4
+  };
+
+  Tensor lhs = makeInputTensor<DType>({4, 3}, lhs_values, memory_manager);
+  Tensor rhs = makeInputTensor<DType>({3}, rhs_values, memory_manager);
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  Less less_op(&lhs, &rhs, &result);
+  less_op.configure();
+  memory_manager->allocate_memory(result);
+  less_op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({4, 3}));
+}
+
+TEST_F(LessTest, Int32)
+{
+  checkIntegerSimple<loco::DataType::S32>(_memory_manager.get());    // equal shapes
+  checkIntegerBroadcast<loco::DataType::S32>(_memory_manager.get()); // {4, 3} against {3}
+  SUCCEED();
+}
+
+TEST_F(LessTest, Int64)
+{
+  checkIntegerSimple<loco::DataType::S64>(_memory_manager.get());    // equal shapes
+  checkIntegerBroadcast<loco::DataType::S64>(_memory_manager.get()); // {4, 3} against {3}
+  SUCCEED();
+}
+
+// Spans 256 values exactly 1/128 apart (one per uint8 code) to avoid rounding errors.
+const float F_MIN = -128.0 / 128.0;
+const float F_MAX = 127.0 / 128.0;
+
+TEST_F(LessTest, Uint8Quantized)
+{
+  std::vector<float> lhs_values{
+    0.5, 0.6, 0.7,  0.9, // Row 1
+    1,   0,   0.05, -1,  // Row 2
+  };
+  // Real-valued operands; both tensors are built with the same quantization parameters below.
+  std::vector<float> rhs_values{
+    0.9, 0.6,  0.55, 0.5, // Row 1
+    -1,  0.05, 0,    1,   // Row 2
+  };
+  // expected[i] == (lhs_values[i] < rhs_values[i])
+  std::vector<bool> expected{
+    true,  false, false, false, // Row 1
+    false, true,  false, true,  // Row 2
+  };
+
+  std::pair<float, int32_t> qparams = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor lhs = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, qparams.first, qparams.second, lhs_values, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, qparams.first, qparams.second, rhs_values, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  Less less_op(&lhs, &rhs, &result);
+  less_op.configure();
+  _memory_manager->allocate_memory(result);
+  less_op.execute();
+
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(LessTest, Uint8QuantizedRescale)
+{
+  std::vector<float> lhs_values{
+    0.5, 0.6, 0.7,  0.9, // Row 1
+    1,   0,   0.05, -1,  // Row 2
+  };
+  // Real-valued operands; each side gets its own quantization parameters below.
+  std::vector<float> rhs_values{
+    0.9, 0.6,  0.6, 0.5, // Row 1
+    -1,  0.05, 0,   1,   // Row 2
+  };
+  // expected[i] == (lhs_values[i] < rhs_values[i])
+  std::vector<bool> expected{
+    true,  false, false, false, // Row 1
+    false, true,  false, true,  // Row 2
+  };
+
+  std::pair<float, int32_t> lhs_qparams = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  std::pair<float, int32_t> rhs_qparams = quantizationParams<uint8_t>(F_MIN * 1.2, F_MAX * 1.5);
+  // The right operand is quantized over a wider range than the left one.
+  Tensor lhs = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, lhs_qparams.first, lhs_qparams.second, lhs_values, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, rhs_qparams.first, rhs_qparams.second, rhs_values, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  Less less_op(&lhs, &rhs, &result);
+  less_op.configure();
+  _memory_manager->allocate_memory(result);
+  less_op.execute();
+
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(LessTest, Uint8QuantizedBroadcast)
+{
+  std::vector<float> lhs_values{
+    0.4,  -0.8, 0.7,  0.3, // Row 1
+    -0.5, 0.1,  0,    0.5, // Row 2
+    1,    0,    0.05, -1,  // Row 3
+  };
+  // A single row that is compared against every row of the left operand.
+  std::vector<float> rhs_values{
+    -1, 0.05, 0, 1, // Row 1
+  };
+  // expected[r][c] == (lhs_values[r][c] < rhs_values[c])
+  std::vector<bool> expected{
+    false, true,  false, true, // Row 1
+    false, false, false, true, // Row 2
+    false, true,  false, true, // Row 3
+  };
+
+  std::pair<float, int32_t> qparams = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor lhs = makeInputTensor<DataType::U8>(
+    {1, 3, 4, 1}, qparams.first, qparams.second, lhs_values, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::U8>(
+    {1, 1, 4, 1}, qparams.first, qparams.second, rhs_values, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  Less less_op(&lhs, &rhs, &result);
+  less_op.configure();
+  _memory_manager->allocate_memory(result);
+  less_op.execute();
+
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({1, 3, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(LessTest, Input_Type_Mismatch_NEG)
+{
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::U8>({1}, {1}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+  // FLOAT32 against U8: configure() is expected to refuse operands of different element types.
+  Less less_op(&lhs, &rhs, &result);
+  EXPECT_ANY_THROW(less_op.configure());
+}
+
+TEST_F(LessTest, Input_Output_Type_NEG)
+{
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::FLOAT32);
+  // The output is FLOAT32 instead of BOOL, which configure() is expected to reject.
+  Less less_op(&lhs, &rhs, &result);
+  EXPECT_ANY_THROW(less_op.configure());
+}
+
+TEST_F(LessTest, Float_Broadcast_NEG)
+{
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({2}, {1.f, 2.f}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>({3}, {1.f, 2.f, 3.f}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+  // Shapes {2} and {3} cannot be broadcast against each other.
+  Less less_op(&lhs, &rhs, &result);
+  ASSERT_ANY_THROW(less_op.configure());
+}
+
+TEST_F(LessTest, Int32_Broadcast_NEG)
+{
+  Tensor lhs = makeInputTensor<DataType::S32>({2}, {1, 2}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::S32>({3}, {1, 2, 3}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+  // Shapes {2} and {3} cannot be broadcast against each other.
+  Less less_op(&lhs, &rhs, &result);
+  ASSERT_ANY_THROW(less_op.configure());
+}
+
+TEST_F(LessTest, Int64_Broadcast_NEG)
+{
+  Tensor lhs = makeInputTensor<DataType::S64>({2}, {1, 2}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::S64>({3}, {1, 2, 3}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+  // Shapes {2} and {3} cannot be broadcast against each other.
+  Less less_op(&lhs, &rhs, &result);
+  ASSERT_ANY_THROW(less_op.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LessEqual.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/LessEqual.cpp
new file mode 100644
index 000000000..b474bc47a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LessEqual.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LessEqual.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/comparisons.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+// Registers x and y as the kernel inputs (slots 0 and 1) and output as its only output.
+LessEqual::LessEqual(const Tensor *x, const Tensor *y, Tensor *output) : Kernel({x, y}, {output}) {}
+// Validates element types, prepares the U8 rescaling factors and sizes the output tensor.
+void LessEqual::configure()
+{
+  LUCI_INTERPRETER_CHECK(x()->element_type() == y()->element_type());
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::BOOL);
+  // For U8 inputs, store each scale as a (multiplier, shift) pair for evalQuantized().
+  if (x()->element_type() == DataType::U8)
+  { // NOTE(review): the helper's name suggests it expects scales below one - confirm
+    quantizeMultiplierSmallerThanOneExp(x()->scale(), &_x_multiplier, &_x_shift);
+    quantizeMultiplierSmallerThanOneExp(y()->scale(), &_y_multiplier, &_y_shift);
+  }
+  output()->resize(calculateShapeForBroadcast(x()->shape(), y()->shape()));
+}
+
+void LessEqual::execute() const
+{
+  // Pick the evaluation routine from the element type of x; configure() has already required
+  // y to have the same type.
+  const auto input_type = x()->element_type();
+
+  if (input_type == DataType::FLOAT32)
+    evalFloat();
+  else if (input_type == DataType::S64)
+    evalInteger<int64_t>();
+  else if (input_type == DataType::S32)
+    evalInteger<int32_t>();
+  else if (input_type == DataType::U8)
+    evalQuantized(); // uses the multipliers/shifts prepared by configure()
+  else
+  {
+    // No evaluator exists for the remaining element types.
+    throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void LessEqual::evalFloat() const
+{
+  // Float inputs are compared directly; only the broadcast flag of the params is filled in.
+  tflite::ComparisonParams cmp_params;
+  cmp_params.is_broadcast = x()->shape() != y()->shape();
+  const auto lhs = getTensorData<float>(x());
+  const auto rhs = getTensorData<float>(y());
+  auto result = getTensorData<bool>(output());
+
+  if (!cmp_params.is_broadcast)
+  {
+    // Same shape on both sides: plain element-by-element kernel.
+    tflite::reference_ops::LessEqual(cmp_params, getTensorShape(x()), lhs,
+                                     getTensorShape(y()), rhs, getTensorShape(output()), result);
+    return;
+  }
+  // Shapes differ: go through the Broadcast4DSlow variant of the reference kernel.
+  tflite::reference_ops::Broadcast4DSlowLessEqual(cmp_params, getTensorShape(x()), lhs,
+                                                  getTensorShape(y()), rhs,
+                                                  getTensorShape(output()), result);
+}
+
+template <typename T> void LessEqual::evalInteger() const
+{
+  // Integer inputs (execute() instantiates T as int32_t or int64_t) use the NoScaling kernels.
+  tflite::ComparisonParams cmp_params;
+  cmp_params.is_broadcast = x()->shape() != y()->shape();
+  const auto lhs = getTensorData<T>(x());
+  const auto rhs = getTensorData<T>(y());
+  auto result = getTensorData<bool>(output());
+
+  if (!cmp_params.is_broadcast)
+  {
+    // Same shape on both sides: plain element-by-element kernel.
+    tflite::reference_ops::LessEqualNoScaling(cmp_params, getTensorShape(x()), lhs,
+                                              getTensorShape(y()), rhs, getTensorShape(output()),
+                                              result);
+    return;
+  }
+  // Shapes differ: go through the Broadcast4DSlow variant of the reference kernel.
+  tflite::reference_ops::Broadcast4DSlowLessEqualNoScaling(cmp_params, getTensorShape(x()), lhs,
+                                                           getTensorShape(y()), rhs,
+                                                           getTensorShape(output()), result);
+}
+
+void LessEqual::evalQuantized() const
+{
+  // U8 path: hand the per-input offset and the (multiplier, shift) from configure() to tflite.
+  tflite::ComparisonParams cmp_params;
+  cmp_params.is_broadcast = x()->shape() != y()->shape();
+  cmp_params.left_shift = 8;
+  // Offsets are the negated zero points (note the '-').
+  cmp_params.input1_offset = -x()->zero_point();
+  cmp_params.input1_multiplier = _x_multiplier;
+  cmp_params.input1_shift = _x_shift;
+  cmp_params.input2_offset = -y()->zero_point();
+  cmp_params.input2_multiplier = _y_multiplier;
+  cmp_params.input2_shift = _y_shift;
+
+  const auto lhs = getTensorData<uint8_t>(x());
+  const auto rhs = getTensorData<uint8_t>(y());
+  auto result = getTensorData<bool>(output());
+
+  if (!cmp_params.is_broadcast)
+  {
+    tflite::reference_ops::LessEqualWithScaling(cmp_params, getTensorShape(x()), lhs,
+                                                getTensorShape(y()), rhs,
+                                                getTensorShape(output()), result);
+    return;
+  }
+  tflite::reference_ops::Broadcast4DSlowLessEqualWithScaling(
+    cmp_params, getTensorShape(x()), lhs, getTensorShape(y()), rhs, getTensorShape(output()),
+    result);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LessEqual.h b/compiler/luci-micro/luci-interpreter/src/kernels/LessEqual.h
new file mode 100644
index 000000000..f82ea90d4
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LessEqual.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_LESS_EQUAL_H
+#define LUCI_INTERPRETER_KERNELS_LESS_EQUAL_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+// Kernel computing the element-wise comparison x <= y into a BOOL output tensor.
+class LessEqual : public Kernel
+{
+public:
+  LessEqual(const Tensor *x, const Tensor *y, Tensor *output);
+  // Operand accessors: x is input slot 0, y is input slot 1.
+  const Tensor *x() const { return _inputs[0]; }
+  const Tensor *y() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  template <typename T> void evalInteger() const;
+  void evalQuantized() const;
+  // Fixed-point (multiplier, shift) pair per input: written by configure() for U8 inputs and
+  // read by evalQuantized(). They keep their zero defaults for every other element type.
+  int32_t _x_multiplier{0};
+  int _x_shift{0};
+  int32_t _y_multiplier{0};
+  int _y_shift{0};
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_LESS_EQUAL_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LessEqual.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/LessEqual.test.cpp
new file mode 100644
index 000000000..b2e2fa7a1
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LessEqual.test.cpp
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LessEqual.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+// Fixture for the LessEqual kernel tests: SetUp() installs a new TestMemoryManager.
+class LessEqualTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Passed to makeInputTensor() and used to allocate the output tensor in each test.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(LessEqualTest, FloatSimple)
+{
+  std::vector<float> lhs_values{
+    0.5, 0.7, 0.9, // Row 1
+    1,   0,   -1,  // Row 2
+  };
+  // Same shape as the left operand, so no broadcasting is involved.
+  std::vector<float> rhs_values{
+    0.9, 0.7, 0.5, // Row 1
+    -1,  0,   1,   // Row 2
+  };
+  // expected[i] == (lhs_values[i] <= rhs_values[i])
+  std::vector<bool> expected{
+    true,  true, false, // Row 1
+    false, true, true,  // Row 2
+  };
+
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({2, 3}, lhs_values, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>({2, 3}, rhs_values, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  LessEqual less_equal_op(&lhs, &rhs, &result);
+  less_equal_op.configure();
+  _memory_manager->allocate_memory(result);
+  less_equal_op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({2, 3}));
+}
+
+TEST_F(LessEqualTest, FloatBroardcast)
+{
+  std::vector<float> lhs_values{
+    0.5, 0.7, 0.9, // Row 1
+    1,   0,   -1,  // Row 2
+    -1,  0,   1,   // Row 3
+  };
+  // A single row that is compared against every row of the left operand.
+  std::vector<float> rhs_values{
+    0.9, 0.7, 0.5, // Row 1
+  };
+  // expected[r][c] == (lhs_values[r][c] <= rhs_values[c])
+  std::vector<bool> expected{
+    true,  true, false, // Row 1
+    false, true, true,  // Row 2
+    true,  true, false, // Row 3
+  };
+
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({3, 3}, lhs_values, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>({1, 3}, rhs_values, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  LessEqual less_equal_op(&lhs, &rhs, &result);
+  less_equal_op.configure();
+  _memory_manager->allocate_memory(result);
+  less_equal_op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({3, 3}));
+}
+
+template <loco::DataType DType>
+void checkIntegerSimple(luci_interpreter::IMemoryManager *memory_manager)
+{
+  using value_t = typename loco::DataTypeImpl<DType>::Type;
+  value_t lowest = std::numeric_limits<value_t>::min();
+  value_t highest = std::numeric_limits<value_t>::max();
+  std::vector<value_t> lhs_values{lowest, 2, highest};
+  // Covers both ends of the value range plus one pair of small operands.
+  std::vector<value_t> rhs_values{lowest + 1, -2, highest};
+  // min <= min + 1, !(2 <= -2), max <= max
+  std::vector<bool> expected{true, false, true};
+
+  Tensor lhs = makeInputTensor<DType>({3}, lhs_values, memory_manager);
+  Tensor rhs = makeInputTensor<DType>({3}, rhs_values, memory_manager);
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  LessEqual less_equal_op(&lhs, &rhs, &result);
+  less_equal_op.configure();
+  memory_manager->allocate_memory(result);
+  less_equal_op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({3}));
+}
+
+template <loco::DataType DType>
+void checkIntegerBroadcast(luci_interpreter::IMemoryManager *memory_manager)
+{
+  using value_t = typename loco::DataTypeImpl<DType>::Type;
+  value_t lowest = std::numeric_limits<value_t>::min();
+  value_t highest = std::numeric_limits<value_t>::max();
+  std::vector<value_t> lhs_values{
+    lowest, 2,  3,       // Row 1
+    4,      5,  highest, // Row 2
+    -1,     -4, -3,      // Row 3
+    lowest, -2, highest, // Row 4
+  };
+  // One row, compared against each of the four rows above.
+  std::vector<value_t> rhs_values{
+    lowest + 1, -2, highest - 1, // Row 1
+  };
+  // expected[r][c] == (lhs_values[r][c] <= rhs_values[c])
+  std::vector<bool> expected{
+    true,  false, true,  // Row 1
+    false, false, false, // Row 2
+    false, true,  true,  // Row 3
+    true,  true,  false, // Row 4
+  };
+
+  Tensor lhs = makeInputTensor<DType>({4, 3}, lhs_values, memory_manager);
+  Tensor rhs = makeInputTensor<DType>({3}, rhs_values, memory_manager);
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  LessEqual less_equal_op(&lhs, &rhs, &result);
+  less_equal_op.configure();
+  memory_manager->allocate_memory(result);
+  less_equal_op.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({4, 3}));
+}
+
+TEST_F(LessEqualTest, Int32)
+{
+  checkIntegerSimple<loco::DataType::S32>(_memory_manager.get());    // equal shapes
+  checkIntegerBroadcast<loco::DataType::S32>(_memory_manager.get()); // {4, 3} against {3}
+  SUCCEED();
+}
+
+TEST_F(LessEqualTest, Int64)
+{
+  checkIntegerSimple<loco::DataType::S64>(_memory_manager.get());    // equal shapes
+  checkIntegerBroadcast<loco::DataType::S64>(_memory_manager.get()); // {4, 3} against {3}
+  SUCCEED();
+}
+
+// Spans 256 values exactly 1/128 apart (one per uint8 code) to avoid rounding errors.
+const float F_MIN = -128.0 / 128.0;
+const float F_MAX = 127.0 / 128.0;
+
+TEST_F(LessEqualTest, Uint8Quantized)
+{
+  std::vector<float> lhs_values{
+    0.5, 0.6, 0.7,  0.9, // Row 1
+    1,   0,   0.05, -1,  // Row 2
+  };
+  // Real-valued operands; both tensors are built with the same quantization parameters below.
+  std::vector<float> rhs_values{
+    0.9, 0.6,  0.55, 0.5, // Row 1
+    -1,  0.05, 0,    1,   // Row 2
+  };
+  // expected[i] == (lhs_values[i] <= rhs_values[i])
+  std::vector<bool> expected{
+    true,  true, false, false, // Row 1
+    false, true, false, true,  // Row 2
+  };
+
+  std::pair<float, int32_t> qparams = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor lhs = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, qparams.first, qparams.second, lhs_values, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, qparams.first, qparams.second, rhs_values, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  LessEqual less_equal_op(&lhs, &rhs, &result);
+  less_equal_op.configure();
+  _memory_manager->allocate_memory(result);
+  less_equal_op.execute();
+
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(LessEqualTest, Uint8QuantizedRescale)
+{
+  std::vector<float> lhs_values{
+    0.5, 0.6, 0.7,  0.9, // Row 1
+    1,   0,   0.05, -1,  // Row 2
+  };
+  // Real-valued operands; each side gets its own quantization parameters below.
+  std::vector<float> rhs_values{
+    0.9, 0.6,  0.6, 0.5, // Row 1
+    -1,  0.05, 0,   1,   // Row 2
+  };
+  // expected[i] == (lhs_values[i] <= rhs_values[i])
+  std::vector<bool> expected{
+    true,  true, false, false, // Row 1
+    false, true, false, true,  // Row 2
+  };
+
+  std::pair<float, int32_t> lhs_qparams = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  std::pair<float, int32_t> rhs_qparams = quantizationParams<uint8_t>(F_MIN * 1.2, F_MAX * 1.5);
+  // The right operand is quantized over a wider range than the left one.
+  Tensor lhs = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, lhs_qparams.first, lhs_qparams.second, lhs_values, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, rhs_qparams.first, rhs_qparams.second, rhs_values, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  LessEqual less_equal_op(&lhs, &rhs, &result);
+  less_equal_op.configure();
+  _memory_manager->allocate_memory(result);
+  less_equal_op.execute();
+
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(LessEqualTest, Uint8QuantizedBroadcast)
+{
+  std::vector<float> lhs_values{
+    0.4,  -0.8, 0.7,  0.3, // Row 1
+    -0.5, 0.1,  0,    0.5, // Row 2
+    1,    0,    0.05, -1,  // Row 3
+  };
+  // A single row that is compared against every row of the left operand.
+  std::vector<float> rhs_values{
+    -1, 0.05, 0, 1, // Row 1
+  };
+  // expected[r][c] == (lhs_values[r][c] <= rhs_values[c])
+  std::vector<bool> expected{
+    false, true,  false, true, // Row 1
+    false, false, true,  true, // Row 2
+    false, true,  false, true, // Row 3
+  };
+
+  std::pair<float, int32_t> qparams = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor lhs = makeInputTensor<DataType::U8>(
+    {1, 3, 4, 1}, qparams.first, qparams.second, lhs_values, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::U8>(
+    {1, 1, 4, 1}, qparams.first, qparams.second, rhs_values, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  LessEqual less_equal_op(&lhs, &rhs, &result);
+  less_equal_op.configure();
+  _memory_manager->allocate_memory(result);
+  less_equal_op.execute();
+
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({1, 3, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(LessEqualTest, Input_Type_Mismatch_NEG)
+{
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::U8>({1}, {1}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+  // FLOAT32 against U8: configure() checks that both operands share one element type.
+  LessEqual less_equal_op(&lhs, &rhs, &result);
+  EXPECT_ANY_THROW(less_equal_op.configure());
+}
+
+TEST_F(LessEqualTest, Input_Output_Type_NEG)
+{
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::FLOAT32);
+  // The output is FLOAT32 instead of BOOL, which configure() checks for.
+  LessEqual less_equal_op(&lhs, &rhs, &result);
+  EXPECT_ANY_THROW(less_equal_op.configure());
+}
+
+TEST_F(LessEqualTest, Float_Broadcast_NEG)
+{
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({2}, {1.f, 2.f}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>({3}, {1.f, 2.f, 3.f}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+  // Shapes {2} and {3} cannot be broadcast against each other.
+  LessEqual less_equal_op(&lhs, &rhs, &result);
+  ASSERT_ANY_THROW(less_equal_op.configure());
+}
+
+TEST_F(LessEqualTest, Int32_Broadcast_NEG)
+{
+  Tensor lhs = makeInputTensor<DataType::S32>({2}, {1, 2}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::S32>({3}, {1, 2, 3}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+  // Shapes {2} and {3} cannot be broadcast against each other.
+  LessEqual less_equal_op(&lhs, &rhs, &result);
+  ASSERT_ANY_THROW(less_equal_op.configure());
+}
+
+TEST_F(LessEqualTest, Int64_Broadcast_NEG)
+{
+  Tensor lhs = makeInputTensor<DataType::S64>({2}, {1, 2}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::S64>({3}, {1, 2, 3}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+  // Shapes {2} and {3} cannot be broadcast against each other.
+  LessEqual less_equal_op(&lhs, &rhs, &result);
+  ASSERT_ANY_THROW(less_equal_op.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LocalResponseNormalization.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/LocalResponseNormalization.cpp
new file mode 100644
index 000000000..a2bf442b0
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LocalResponseNormalization.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LocalResponseNormalization.h"
+
+#include "kernels/Utils.h"
+
+#include "PALLocalResponseNormalization.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+// Forwards the single input, the single output and the operator parameters to the base class.
+LocalResponseNormalization::LocalResponseNormalization(
+  const Tensor *input, Tensor *output, const LocalResponseNormalizationParams &params)
+  : KernelWithParams<LocalResponseNormalizationParams>({input}, {output}, params)
+{
+}
+// Requires a rank-4 FLOAT32 input/output pair and gives the output the input's shape.
+void LocalResponseNormalization::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() == 4);
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32);
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  output()->resize(input()->shape());
+}
+
+void LocalResponseNormalization::execute() const
+{
+  // FLOAT32 is the only implemented element type; everything else is rejected up front.
+  if (output()->element_type() != DataType::FLOAT32)
+    throw std::runtime_error("Unsupported type.");
+
+  // Copy the kernel parameters into the tflite structure passed to the PAL routine;
+  // the kernel's "radius" is what tflite calls "range".
+  tflite::LocalResponseNormalizationParams lrn_params;
+  lrn_params.range = params().radius;
+  lrn_params.bias = params().bias;
+  lrn_params.alpha = params().alpha;
+  lrn_params.beta = params().beta;
+
+  luci_interpreter_pal::LocalResponseNormalization(
+    lrn_params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(output()),
+    getTensorData<float>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LocalResponseNormalization.h b/compiler/luci-micro/luci-interpreter/src/kernels/LocalResponseNormalization.h
new file mode 100644
index 000000000..60408a104
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LocalResponseNormalization.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_LOCALRESPONSENORMALIZATION_H
+#define LUCI_INTERPRETER_KERNELS_LOCALRESPONSENORMALIZATION_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Kernel for local response normalization: one input tensor, one output tensor and a
+// LocalResponseNormalizationParams block held by the KernelWithParams base.
+class LocalResponseNormalization : public KernelWithParams<LocalResponseNormalizationParams>
+{
+public:
+  LocalResponseNormalization(const Tensor *input, Tensor *output,
+                             const LocalResponseNormalizationParams &params);
+
+  // Accessors for the single input (slot 0 of _inputs) and single output (slot 0 of _outputs).
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  // configure() validates the tensors and sizes the output; execute() computes the result.
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_LOCALRESPONSENORMALIZATION_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp
new file mode 100644
index 000000000..4a9d4739f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LocalResponseNormalization.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Fixture: SetUp() installs a new TestMemoryManager that the test cases use to allocate tensors.
+class LocalResponseNormalizationTest : public ::testing::Test
+{
+protected:
+  std::unique_ptr<IMemoryManager> _memory_manager;
+
+  void SetUp() override
+  {
+    _memory_manager = std::make_unique<TestMemoryManager>();
+  }
+};
+
+// radius = 20, bias = 0, alpha = 1, beta = 0.5 on a {1, 1, 1, 6} FLOAT32 input.
+TEST_F(LocalResponseNormalizationTest, SameAsL2Norm)
+{
+  const Shape input_shape{1, 1, 1, 6};
+  const std::vector<float> input_data{-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  LocalResponseNormalizationParams params{};
+  params.radius = 20;
+  params.bias = 0.0;
+  params.alpha = 1.0;
+  params.beta = 0.5;
+
+  LocalResponseNormalization kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  // The output buffer can only be allocated once configure() has fixed the output shape.
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  const std::vector<float> ref_output_data{-0.55, 0.3, 0.35, 0.6, -0.35, 0.05};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+// Same input as SameAsL2Norm, but with alpha raised to 4 (radius = 20, bias = 0, beta = 0.5).
+TEST_F(LocalResponseNormalizationTest, WithAlpha)
+{
+  const Shape input_shape{1, 1, 1, 6};
+  const std::vector<float> input_data{-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  LocalResponseNormalizationParams params{};
+  params.radius = 20;
+  params.bias = 0.0;
+  params.alpha = 4.0;
+  params.beta = 0.5;
+
+  LocalResponseNormalization kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  const std::vector<float> ref_output_data{-0.275, 0.15, 0.175, 0.3, -0.175, 0.025};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+// radius = 20, bias = 9, alpha = 4, beta = 0.5 on a {1, 1, 1, 6} FLOAT32 input.
+TEST_F(LocalResponseNormalizationTest, WithBias)
+{
+  const Shape input_shape{1, 1, 1, 6};
+  const std::vector<float> input_data{-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  LocalResponseNormalizationParams params{};
+  params.radius = 20;
+  params.bias = 9.0;
+  params.alpha = 4.0;
+  params.beta = 0.5;
+
+  LocalResponseNormalization kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  const std::vector<float> ref_output_data{-0.22, 0.12, 0.14, 0.24, -0.14, 0.02};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+// Same as WithBias except that the radius is reduced to 2 (bias = 9, alpha = 4, beta = 0.5).
+TEST_F(LocalResponseNormalizationTest, SmallRadius)
+{
+  const Shape input_shape{1, 1, 1, 6};
+  const std::vector<float> input_data{-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  LocalResponseNormalizationParams params{};
+  params.radius = 2;
+  params.bias = 9.0;
+  params.alpha = 4.0;
+  params.beta = 0.5;
+
+  LocalResponseNormalization kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  const std::vector<float> ref_output_data{-0.264926, 0.125109, 0.140112,
+                                           0.267261,  -0.161788, 0.0244266};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+// A rank-3 input must make configure() throw: the kernel requires exactly four dimensions.
+TEST_F(LocalResponseNormalizationTest, InvalidInputDimension_NEG)
+{
+  const Shape input_shape{1, 1, 6};
+  const std::vector<float> input_data{-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  LocalResponseNormalizationParams params{};
+  params.radius = 20;
+  params.bias = 0.0;
+  params.alpha = 1.0;
+  params.beta = 0.5;
+
+  LocalResponseNormalization kernel(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+// A U8 output paired with a FLOAT32 input must make configure() throw.
+TEST_F(LocalResponseNormalizationTest, InvalidInputOutputType_NEG)
+{
+  const Shape input_shape{1, 1, 1, 6};
+  const std::vector<float> input_data{-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  LocalResponseNormalizationParams params{};
+  params.radius = 20;
+  params.bias = 0.0;
+  params.alpha = 1.0;
+  params.beta = 0.5;
+
+  LocalResponseNormalization kernel(&input_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LogSoftmax.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/LogSoftmax.cpp
new file mode 100644
index 000000000..79c315338
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LogSoftmax.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LogSoftmax.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/log_softmax.h>
+
+#include "PALLogSoftmax.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Builds the kernel with `input` as its only input tensor and `output` as its only output.
+LogSoftmax::LogSoftmax(const Tensor *input, Tensor *output)
+  : Kernel({input}, {output})
+{
+}
+
+// Requires input and output to share an element type. For U8 tensors the output must be
+// quantized with scale 16/256 and zero point 255, and _table is handed to the PAL
+// PopulateSoftmaxLookupTable together with the input scale and beta = 1. Finally the output is
+// resized to the input's shape.
+void LogSoftmax::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+
+  const bool is_quantized = input()->element_type() == DataType::U8;
+  if (is_quantized)
+  {
+    LUCI_INTERPRETER_CHECK(output()->scale() == 16. / 256);
+    LUCI_INTERPRETER_CHECK(output()->zero_point() == 255);
+
+    // Temporary parameter block used only to pass the table pointer and beta to the PAL.
+    tflite::SoftmaxParams table_params{};
+    table_params.table = _table;
+    table_params.beta = 1.0;
+    luci_interpreter_pal::PopulateSoftmaxLookupTable(&table_params, input()->scale(),
+                                                     table_params.beta);
+  }
+
+  output()->resize(input()->shape());
+}
+
+// Dispatches on the input element type: FLOAT32 goes to evalFloat(), U8 to evalQuantized();
+// every other type raises std::runtime_error("Unsupported type.").
+void LogSoftmax::execute() const
+{
+  const auto type = input()->element_type();
+  if (type == DataType::FLOAT32)
+  {
+    evalFloat();
+  }
+  else if (type == DataType::U8)
+  {
+    evalQuantized();
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+}
+
+// FLOAT32 path: calls the TFLite reference LogSoftmax with a value-initialized SoftmaxParams.
+void LogSoftmax::evalFloat() const
+{
+  const auto input_shape = getTensorShape(input());
+  const auto output_shape = getTensorShape(output());
+  const float *input_data = getTensorData<float>(input());
+  float *output_data = getTensorData<float>(output());
+
+  tflite::SoftmaxParams params{};
+  tflite::reference_ops::LogSoftmax(params, input_shape, input_data, output_shape, output_data);
+}
+
+// U8 path: fills a SoftmaxParams block with the lookup table and the output quantization
+// (zero point, scale), lets the PAL InitializeParams adjust it for the input scale and
+// beta = 1, then runs the PAL LogSoftmax.
+void LogSoftmax::evalQuantized() const
+{
+  tflite::SoftmaxParams op_params{};
+  // _table is a member array and this method is const, hence the const_cast to obtain the
+  // mutable float pointer that SoftmaxParams::table expects.
+  op_params.table = const_cast<float *>(_table);
+  op_params.zero_point = output()->zero_point();
+  op_params.scale = output()->scale();
+
+  const auto scale_in = input()->scale();
+  const float beta = 1.0;
+  luci_interpreter_pal::InitializeParams(&op_params, scale_in, beta);
+
+  luci_interpreter_pal::LogSoftmax(op_params, scale_in, getTensorShape(input()),
+                                   getTensorData<uint8_t>(input()), getTensorShape(output()),
+                                   getTensorData<uint8_t>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LogSoftmax.h b/compiler/luci-micro/luci-interpreter/src/kernels/LogSoftmax.h
new file mode 100644
index 000000000..18477fbe3
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LogSoftmax.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_LOGSOFTMAX_H
+#define LUCI_INTERPRETER_KERNELS_LOGSOFTMAX_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Log-softmax kernel with one input tensor and one output tensor. FLOAT32 and U8 inputs are
+// dispatched to evalFloat() and evalQuantized() respectively.
+class LogSoftmax : public Kernel
+{
+public:
+  LogSoftmax(const Tensor *input, Tensor *output);
+
+  // Accessors for the single input (slot 0 of _inputs) and single output (slot 0 of _outputs).
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+
+  // Lookup table for the U8 path: handed to the PAL in configure() and read in evalQuantized().
+  float _table[256];
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_LOGSOFTMAX_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LogSoftmax.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/LogSoftmax.test.cpp
new file mode 100644
index 000000000..50dcd5c28
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LogSoftmax.test.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LogSoftmax.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Fixture: SetUp() installs a new TestMemoryManager that the test cases use to allocate tensors.
+class LogSoftmaxTest : public ::testing::Test
+{
+protected:
+  std::unique_ptr<IMemoryManager> _memory_manager;
+
+  void SetUp() override
+  {
+    _memory_manager = std::make_unique<TestMemoryManager>();
+  }
+};
+
+// FLOAT32 path on a {2, 4} input, compared against reference values with FloatArrayNear.
+TEST_F(LogSoftmaxTest, Float)
+{
+  const std::vector<float> input_data{
+    0, -6, 2,  4, //
+    3, -2, 10, 1, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 4}, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  LogSoftmax kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  // The output buffer can only be allocated once configure() has fixed the output shape.
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  const std::vector<float> expected{
+    -4.14297, -10.14297, -2.14297,   -.142971, //
+    -7.00104, -12.00104, -.00104087, -9.00104, //
+  };
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(expected));
+}
+
+// U8 path: the input is quantized over [-10, 10]; the output uses scale 16/256 and zero point
+// 255. Checks the dequantized values (within one output step), the shape and the raw bytes.
+TEST_F(LogSoftmaxTest, Uint8)
+{
+  const float output_scale = 16.0f / 256.0f;
+  const float tolerance = 16.0f / 256.0f;
+  const std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-10.0f, 10.0f);
+
+  const std::vector<float> input_data{
+    0, -6, 2,  4, //
+    3, -2, 10, 1, //
+  };
+  Tensor input_tensor = makeInputTensor<DataType::U8>({2, 4}, quant_param.first, quant_param.second,
+                                                      input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, output_scale, 255);
+
+  LogSoftmax kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  const std::vector<float> expected{
+    -4.14297, -10.14297, -2.14297,   -.142971, //
+    -7.00104, -12.00104, -.00104087, -9.00104, //
+  };
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(expected, tolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAre(2, 4));
+  EXPECT_THAT(extractTensorData<uint8_t>(output_tensor),
+              ::testing::ElementsAreArray({189, 93, 221, 253, 142, 63, 255, 111}));
+}
+
+// A FLOAT32 input paired with a U8 output must make configure() throw (element types differ).
+TEST_F(LogSoftmaxTest, InvalidInputOutputType_NEG)
+{
+  const Shape input_shape{2, 4};
+  const std::vector<float> input_data{
+    0, -6, 2,  4, //
+    3, -2, 10, 1, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 16. / 256, 255);
+
+  LogSoftmax kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+// A U8 output whose scale is 20/256 instead of the required 16/256 must make configure() throw.
+TEST_F(LogSoftmaxTest, InvalidOutputQuantParam_NEG)
+{
+  const std::vector<float> input_data{
+    0, -6, 2,  4, //
+    3, -2, 10, 1, //
+  };
+  const std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-10, 10);
+  const Shape input_shape{2, 4};
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    input_shape, quant_param.first, quant_param.second, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 20. / 256, 255);
+
+  LogSoftmax kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LogicalAnd.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalAnd.cpp
new file mode 100644
index 000000000..8e7263231
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalAnd.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LogicalAnd.h"
+
+#include "kernels/Utils.h"
+
+#include "kernels/BinaryOpCommon.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Builds the kernel with `input1` and `input2` as input tensors (in that order) and `output` as
+// the only output tensor.
+LogicalAnd::LogicalAnd(const Tensor *input1, const Tensor *input2, Tensor *output)
+  : Kernel({input1, input2}, {output})
+{
+}
+
+// Requires both inputs and the output to share one element type, then resizes the output to
+// the broadcast of the two input shapes.
+void LogicalAnd::configure()
+{
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type());
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type());
+
+  const auto broadcast_shape = calculateShapeForBroadcast(input1()->shape(), input2()->shape());
+  output()->resize(broadcast_shape);
+}
+
+// Runs the element-wise AND for BOOL inputs; any other element type raises
+// std::runtime_error("Unsupported type.").
+void LogicalAnd::execute() const
+{
+  if (input1()->element_type() != DataType::BOOL)
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+
+  evalLogicalAnd();
+}
+
+// Applies `x && y` to the two inputs through BinaryOpBroadcastSlow and stores the result in the
+// output tensor.
+inline void LogicalAnd::evalLogicalAnd() const
+{
+  auto logical_and = [](bool lhs, bool rhs) { return lhs && rhs; };
+
+  BinaryOpBroadcastSlow(getTensorShape(input1()), getTensorData<bool>(input1()),
+                        getTensorShape(input2()), getTensorData<bool>(input2()),
+                        getTensorShape(output()), getTensorData<bool>(output()), logical_and);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LogicalAnd.h b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalAnd.h
new file mode 100644
index 000000000..46b889986
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalAnd.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_LOGICALAND_H
+#define LUCI_INTERPRETER_KERNELS_LOGICALAND_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Element-wise logical AND kernel with two input tensors and one output tensor.
+class LogicalAnd : public Kernel
+{
+public:
+  LogicalAnd(const Tensor *input1, const Tensor *input2, Tensor *output);
+
+  // Accessors for the two inputs (slots 0 and 1 of _inputs) and the output (slot 0 of _outputs).
+  const Tensor *input1() const { return _inputs[0]; }
+  const Tensor *input2() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  // BOOL implementation invoked from execute().
+  inline void evalLogicalAnd() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_LOGICALAND_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LogicalAnd.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalAnd.test.cpp
new file mode 100644
index 000000000..21b7951e0
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalAnd.test.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LogicalAnd.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Fixture: SetUp() installs a new TestMemoryManager that the test cases use to allocate tensors.
+class LogicalAndTest : public ::testing::Test
+{
+protected:
+  std::unique_ptr<IMemoryManager> _memory_manager;
+
+  void SetUp() override
+  {
+    _memory_manager = std::make_unique<TestMemoryManager>();
+  }
+};
+
+// Two {1, 1, 1, 4} BOOL inputs of equal shape; checks the element-wise AND and the output shape.
+TEST_F(LogicalAndTest, Basic)
+{
+  Tensor lhs = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, false, true},
+                                               _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, true, false},
+                                               _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  LogicalAnd kernel(&lhs, &rhs, &output_tensor);
+  kernel.configure();
+  // The output buffer can only be allocated once configure() has fixed the output shape.
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor),
+              ::testing::ElementsAre(true, false, false, false));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAre(1, 1, 1, 4));
+}
+
+// A {1, 1, 1, 4} input AND-ed with a single-element {1, 1, 1, 1} input holding `true`; the
+// output is expected to equal the first input and to have shape {1, 1, 1, 4}.
+TEST_F(LogicalAndTest, Broadcast)
+{
+  const Shape lhs_shape{1, 1, 1, 4};
+  const Shape rhs_shape{1, 1, 1, 1};
+  Tensor lhs = makeInputTensor<DataType::BOOL>(lhs_shape, {true, false, false, true},
+                                               _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::BOOL>(rhs_shape, {true}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  LogicalAnd kernel(&lhs, &rhs, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor),
+              ::testing::ElementsAre(true, false, false, true));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAre(1, 1, 1, 4));
+}
+
+// An S32 first input combined with a BOOL second input must make configure() throw.
+TEST_F(LogicalAndTest, MismatchInputType_NEG)
+{
+  const Shape lhs_shape{1, 1, 1, 4};
+  const Shape rhs_shape{1, 1, 1, 1};
+  Tensor lhs = makeInputTensor<DataType::S32>(lhs_shape, {1, 0, 0, 1}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::BOOL>(rhs_shape, {false}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S32);
+
+  LogicalAnd kernel(&lhs, &rhs, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+// Two S32 inputs combined with a BOOL output must make configure() throw.
+TEST_F(LogicalAndTest, InputTypeInvalid_NEG)
+{
+  const Shape lhs_shape{1, 1, 1, 4};
+  const Shape rhs_shape{1, 1, 1, 1};
+  Tensor lhs = makeInputTensor<DataType::S32>(lhs_shape, {1, 0, 0, 1}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::S32>(rhs_shape, {0}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  LogicalAnd kernel(&lhs, &rhs, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LogicalNot.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalNot.cpp
new file mode 100644
index 000000000..65ab961aa
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalNot.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LogicalNot.h"
+
+#include "kernels/Utils.h"
+
+#include "kernels/BinaryOpCommon.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Builds the kernel with `input` as its only input tensor and `output` as its only output.
+LogicalNot::LogicalNot(const Tensor *input, Tensor *output)
+  : Kernel({input}, {output})
+{
+}
+
+// Requires input and output to share one element type, then resizes the output to the input's
+// shape.
+void LogicalNot::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+
+  const auto &input_shape = input()->shape();
+  output()->resize(input_shape);
+}
+
+// Runs the element-wise NOT for a BOOL input; any other element type raises
+// std::runtime_error("Unsupported type.").
+void LogicalNot::execute() const
+{
+  if (input()->element_type() != DataType::BOOL)
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+
+  evalLogicalNot();
+}
+
+// Writes the negation of every input element to the matching output element. The element count
+// comes from tflite::MatchingFlatSize over the input and output shapes.
+inline void LogicalNot::evalLogicalNot() const
+{
+  int remaining = tflite::MatchingFlatSize(getTensorShape(input()), getTensorShape(output()));
+  bool *dst = getTensorData<bool>(output());
+  const bool *src = getTensorData<bool>(input());
+
+  // Walk both buffers front to back, one element per iteration.
+  while (remaining-- > 0)
+  {
+    *dst++ = !*src++;
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LogicalNot.h b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalNot.h
new file mode 100644
index 000000000..1608fafa5
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalNot.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_LOGICALNOT_H
+#define LUCI_INTERPRETER_KERNELS_LOGICALNOT_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Element-wise logical NOT kernel with one input tensor and one output tensor.
+class LogicalNot : public Kernel
+{
+public:
+  LogicalNot(const Tensor *input, Tensor *output);
+
+  // Accessors for the single input (slot 0 of _inputs) and single output (slot 0 of _outputs).
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  // BOOL implementation invoked from execute().
+  inline void evalLogicalNot() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_LOGICALNOT_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LogicalNot.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalNot.test.cpp
new file mode 100644
index 000000000..3cbf27f6b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalNot.test.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LogicalNot.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Fixture: SetUp() installs a new TestMemoryManager that the test cases use to allocate tensors.
+class LogicalNotTest : public ::testing::Test
+{
+protected:
+  std::unique_ptr<IMemoryManager> _memory_manager;
+
+  void SetUp() override
+  {
+    _memory_manager = std::make_unique<TestMemoryManager>();
+  }
+};
+
+// Negates a {1, 1, 1, 4} BOOL input and checks both the values and the output shape.
+TEST_F(LogicalNotTest, Basic)
+{
+  Tensor source = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, false, true},
+                                                  _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  LogicalNot kernel(&source, &output_tensor);
+  kernel.configure();
+  // The output buffer can only be allocated once configure() has fixed the output shape.
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor),
+              ::testing::ElementsAre(false, true, true, false));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAre(1, 1, 1, 4));
+}
+
+// A BOOL input paired with an S32 output must make configure() throw.
+TEST_F(LogicalNotTest, OutputTypeInvalid_NEG)
+{
+  const Shape input_shape{1, 1, 1, 4};
+  Tensor source = makeInputTensor<DataType::BOOL>(input_shape, {true, false, false, true},
+                                                  _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S32);
+
+  LogicalNot kernel(&source, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+// An S32 input paired with a BOOL output must make configure() throw.
+TEST_F(LogicalNotTest, InputTypeInvalid_NEG)
+{
+  const Shape input_shape{1, 1, 1, 4};
+  Tensor source = makeInputTensor<DataType::S32>(input_shape, {1, 0, 0, 1}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  LogicalNot kernel(&source, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LogicalOr.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalOr.cpp
new file mode 100644
index 000000000..f289ca64f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalOr.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LogicalOr.h"
+
+#include "kernels/Utils.h"
+#include "kernels/BinaryOpCommon.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Builds the kernel with `input1` and `input2` as input tensors (in that order) and `output` as
+// the only output tensor.
+LogicalOr::LogicalOr(const Tensor *input1, const Tensor *input2, Tensor *output)
+  : Kernel({input1, input2}, {output})
+{
+}
+
+// Requires both inputs and the output to be BOOL tensors, then resizes the output to the
+// broadcast of the two input shapes.
+void LogicalOr::configure()
+{
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type());
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == DataType::BOOL);
+  // execute() stores bool values through getTensorData<bool>(output()) without looking at the
+  // output type, so an output of any other element type has to be rejected here.
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::BOOL);
+  output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+// Applies `x || y` to the two inputs through BinaryOpBroadcastSlow and stores the result in the
+// output tensor.
+void LogicalOr::execute() const
+{
+  auto logical_or = [](bool lhs, bool rhs) { return lhs || rhs; };
+
+  BinaryOpBroadcastSlow(getTensorShape(input1()), getTensorData<bool>(input1()),
+                        getTensorShape(input2()), getTensorData<bool>(input2()),
+                        getTensorShape(output()), getTensorData<bool>(output()), logical_or);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LogicalOr.h b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalOr.h
new file mode 100644
index 000000000..88606483f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalOr.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_LOGICALOR_H
+#define LUCI_INTERPRETER_KERNELS_LOGICALOR_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class LogicalOr : public Kernel
+{
+public:
+  LogicalOr(const Tensor *input1, const Tensor *input2, Tensor *output);
+
+  const Tensor *input1() const { return _inputs[0]; }
+  const Tensor *input2() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;     // checks that the inputs are BOOL and resizes the output
+  void execute() const override; // writes x || y of the input elements into the output
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_LOGICALOR_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/LogicalOr.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalOr.test.cpp
new file mode 100644
index 000000000..d65a69a5e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/LogicalOr.test.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/LogicalOr.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class LogicalOrTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(LogicalOrTest, Basic)
+{
+  Tensor input1_tensor = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, false, true},
+                                                         _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, true, false},
+                                                         _memory_manager.get());
+
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  LogicalOr kernel(&input1_tensor, &input2_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor),
+              ::testing::ElementsAre(true, false, true, true));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAre(1, 1, 1, 4));
+}
+
+TEST_F(LogicalOrTest, Broadcast)
+{
+  Tensor input1_tensor = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, false, true},
+                                                         _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::BOOL>({1, 1, 1, 1}, {false}, _memory_manager.get());
+
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  LogicalOr kernel(&input1_tensor, &input2_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(output_tensor),
+              ::testing::ElementsAre(true, false, false, true));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAre(1, 1, 1, 4));
+}
+
+TEST_F(LogicalOrTest, MismatchInputType_NEG)
+{
+  Tensor input1_tensor =
+    makeInputTensor<DataType::S32>({1, 1, 1, 4}, {1, 0, 0, 1}, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::BOOL>({1, 1, 1, 1}, {false}, _memory_manager.get());
+
+  Tensor output_tensor = makeOutputTensor(DataType::S32);
+
+  LogicalOr kernel(&input1_tensor, &input2_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(LogicalOrTest, InputTypeInvalid_NEG)
+{
+  Tensor input1_tensor =
+    makeInputTensor<DataType::S32>({1, 1, 1, 4}, {1, 0, 0, 1}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S32>({1, 1, 1, 1}, {0}, _memory_manager.get());
+
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  LogicalOr kernel(&input1_tensor, &input2_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Logistic.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Logistic.cpp
new file mode 100644
index 000000000..58e4f185d
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Logistic.cpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Logistic.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/logistic.h>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Logistic::Logistic(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {}
+
+void Logistic::configure() // validates types/quantization, then sizes the output
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  if (input()->element_type() == DataType::U8)
+  {
+    LUCI_INTERPRETER_CHECK(output()->scale() == 1. / 256); // U8 output must use scale 1/256
+    populateLookupTable(); // precompute the table that evalQuantized() reads
+  }
+  output()->resize(input()->shape()); // output has the same shape as the input
+}
+
+void Logistic::execute() const
+{
+  // Route to the implementation that matches the input's element type.
+  // FLOAT32 and U8 are the only types with an eval routine in this kernel;
+  // every other type is reported with a runtime_error.
+  const auto input_type = input()->element_type();
+
+  if (input_type == DataType::FLOAT32)
+    evalFloat();
+  else if (input_type == DataType::U8)
+    evalQuantized();
+  else
+    throw std::runtime_error("Unsupported type.");
+}
+
+void Logistic::evalFloat() const
+{
+  tflite::reference_ops::Logistic(getTensorShape(input()), getTensorData<float>(input()),
+                                  getTensorShape(output()), getTensorData<float>(output()));
+}
+
+void Logistic::evalQuantized() const
+{
+  // Map every quantized input through the table filled in by populateLookupTable().
+  const int num_elements =
+    tflite::MatchingFlatSize(getTensorShape(input()), getTensorShape(output()));
+  const uint8_t *src = getTensorData<uint8_t>(input());
+  uint8_t *dst = getTensorData<uint8_t>(output());
+  for (int idx = 0; idx < num_elements; ++idx)
+    dst[idx] = getTableValue(src[idx]);
+}
+
+void Logistic::populateLookupTable() // fills _table for every possible uint8 input value
+{
+  const auto input_scale = static_cast<double>(input()->scale());
+  const auto input_zero_point = static_cast<int32_t>(input()->zero_point());
+  const auto output_scale = static_cast<double>(output()->scale());
+  const auto output_zero_point = static_cast<int32_t>(output()->zero_point());
+  const float inverse_scale = 1 / output_scale; // multiply in the loop instead of dividing
+  int32_t maxval = std::numeric_limits<uint8_t>::max();
+  int32_t minval = std::numeric_limits<uint8_t>::min();
+  for (int32_t val = minval; val <= maxval; ++val)
+  {
+    const float dequantized = input_scale * (val - input_zero_point); // input as a real value
+    const float transformed = 1.0f / (1.0f + std::exp(-dequantized)); // logistic function
+    const float rescaled = std::round(transformed * inverse_scale);   // in output-scale units
+    const int32_t quantized = static_cast<int32_t>(rescaled + output_zero_point);
+    setTableValue(static_cast<uint8_t>(std::max(std::min(maxval, quantized), minval)),
+                  static_cast<uint8_t>(val)); // clamped to the uint8 range, stored at index val
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Logistic.h b/compiler/luci-micro/luci-interpreter/src/kernels/Logistic.h
new file mode 100644
index 000000000..31de6adf0
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Logistic.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_LOGISTIC_H
+#define LUCI_INTERPRETER_KERNELS_LOGISTIC_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Logistic : public Kernel
+{
+public:
+  Logistic(const Tensor *input, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;     // validates the tensors; builds the table for U8 input
+  void execute() const override; // dispatches to evalFloat() or evalQuantized()
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+  void populateLookupTable();
+  void setTableValue(uint8_t value, uint8_t idx) { _table[idx] = value; }
+  uint8_t getTableValue(uint8_t idx) const { return _table[idx]; }
+
+private:
+  uint8_t _table[256]{}; // U8 path: quantized input value -> quantized output value
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_LOGISTIC_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Logistic.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Logistic.test.cpp
new file mode 100644
index 000000000..5a1ea669c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Logistic.test.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Logistic.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<float> input_data, std::initializer_list<float> output_data)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor =
+    makeInputTensor<getElementType<T>()>(input_shape, input_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(getElementType<T>());
+
+  Logistic kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+template <>
+void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
+                    std::initializer_list<int32_t> output_shape,
+                    std::initializer_list<float> input_data,
+                    std::initializer_list<float> output_data)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  std::pair<float, int32_t> input_quant_param =
+    quantizationParams<uint8_t>(std::min(input_data), std::max(input_data));
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>(input_shape, input_quant_param.first, input_quant_param.second,
+                                  input_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 256, 0);
+
+  Logistic kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(output_data, output_tensor.scale() * 2));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+template <typename T> class LogisticTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_SUITE(LogisticTest, DataTypes);
+
+TYPED_TEST(LogisticTest, Simple)
+{
+  Check<TypeParam>(
+    {89}, {89},
+    {-10.0000000000, -9.7727272727, -9.5454545455, -9.3181818182, -9.0909090909, -8.8636363636,
+     -8.6363636364,  -8.4090909091, -8.1818181818, -7.9545454545, -7.7272727273, -7.5000000000,
+     -7.2727272727,  -7.0454545455, -6.8181818182, -6.5909090909, -6.3636363636, -6.1363636364,
+     -5.9090909091,  -5.6818181818, -5.4545454545, -5.2272727273, -5.0000000000, -4.7727272727,
+     -4.5454545455,  -4.3181818182, -4.0909090909, -3.8636363636, -3.6363636364, -3.4090909091,
+     -3.1818181818,  -2.9545454545, -2.7272727273, -2.5000000000, -2.2727272727, -2.0454545455,
+     -1.8181818182,  -1.5909090909, -1.3636363636, -1.1363636364, -0.9090909091, -0.6818181818,
+     -0.4545454545,  -0.2272727273, 0.0000000000,  0.2272727273,  0.4545454545,  0.6818181818,
+     0.9090909091,   1.1363636364,  1.3636363636,  1.5909090909,  1.8181818182,  2.0454545455,
+     2.2727272727,   2.5000000000,  2.7272727273,  2.9545454545,  3.1818181818,  3.4090909091,
+     3.6363636364,   3.8636363636,  4.0909090909,  4.3181818182,  4.5454545455,  4.7727272727,
+     5.0000000000,   5.2272727273,  5.4545454545,  5.6818181818,  5.9090909091,  6.1363636364,
+     6.3636363636,   6.5909090909,  6.8181818182,  7.0454545455,  7.2727272727,  7.5000000000,
+     7.7272727273,   7.9545454545,  8.1818181818,  8.4090909091,  8.6363636364,  8.8636363636,
+     9.0909090909,   9.3181818182,  9.5454545455,  9.7727272727,  10.0000000000},
+    {0.0000453979, 0.0000569815, 0.0000715205, 0.0000897689, 0.0001126729, 0.0001414198,
+     0.0001774998, 0.0002227827, 0.0002796147, 0.0003509396, 0.0004404502, 0.0005527786,
+     0.0006937345, 0.0008706021, 0.0010925128, 0.0013709094, 0.0017201256, 0.0021581065,
+     0.0027073042, 0.0033957870, 0.0042586071, 0.0053394826, 0.0066928509, 0.0083863576,
+     0.0105038445, 0.0131488902, 0.0164489307, 0.0205599431, 0.0256715863, 0.0320125562,
+     0.0398556989, 0.0495221198, 0.0613831074, 0.0758581800, 0.0934070047, 0.1145124805,
+     0.1396521834, 0.1692560327, 0.2036499335, 0.2429886272, 0.2871859014, 0.3358556241,
+     0.3882805886, 0.4434251301, 0.5000000000, 0.5565748699, 0.6117194114, 0.6641443759,
+     0.7128140986, 0.7570113728, 0.7963500665, 0.8307439673, 0.8603478166, 0.8854875195,
+     0.9065929953, 0.9241418200, 0.9386168926, 0.9504778802, 0.9601443011, 0.9679874438,
+     0.9743284137, 0.9794400569, 0.9835510693, 0.9868511098, 0.9894961555, 0.9916136424,
+     0.9933071491, 0.9946605174, 0.9957413929, 0.9966042130, 0.9972926958, 0.9978418935,
+     0.9982798744, 0.9986290906, 0.9989074872, 0.9991293979, 0.9993062655, 0.9994472214,
+     0.9995595498, 0.9996490604, 0.9997203853, 0.9997772173, 0.9998225002, 0.9998585802,
+     0.9998873271, 0.9999102311, 0.9999284795, 0.9999430185, 0.9999546021});
+}
+
+TEST(LogisticTest, IvalidInputOutputType_NEG)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Shape input_shape = {1};
+  std::vector<float> input_data{10};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 256, 0);
+
+  Logistic kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(LogisticTest, IvalidQuantParam_NEG)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Shape input_shape = {2};
+  std::vector<float> input_data{-10, 10};
+  std::pair<float, int32_t> input_quant_param = quantizationParams<uint8_t>(-10, 10);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>(input_shape, input_quant_param.first, input_quant_param.second,
+                                  input_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 255, 0);
+
+  Logistic kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/MaxPool2D.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/MaxPool2D.cpp
new file mode 100644
index 000000000..8d9760ff2
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/MaxPool2D.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/MaxPool2D.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h>
+#include <tensorflow/lite/kernels/internal/reference/pooling.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+MaxPool2D::MaxPool2D(const Tensor *input, Tensor *output, const Pool2DParams &params)
+  : KernelWithParams<Pool2DParams>({input}, {output}, params)
+{
+}
+
+void MaxPool2D::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() == 4); // unlike assert, survives NDEBUG
+  const Shape &input_shape = input()->shape();
+  const int32_t batches = input_shape.dim(0);
+  const int32_t input_height = input_shape.dim(1);
+  const int32_t input_width = input_shape.dim(2);
+  const int32_t depth = input_shape.dim(3);
+
+  const int32_t output_height =
+    computeOutputSize(_params.padding, input_height, _params.filter_height, _params.stride_height);
+  const int32_t output_width =
+    computeOutputSize(_params.padding, input_width, _params.filter_width, _params.stride_width);
+
+  _padding_height =
+    computePadding(_params.stride_height, 1, input_height, _params.filter_height, output_height);
+  _padding_width =
+    computePadding(_params.stride_width, 1, input_width, _params.filter_width, output_width);
+
+  output()->resize({batches, output_height, output_width, depth});
+  if (input()->element_type() == DataType::U8) // scale within 1e-6 and equal zero points
+  {
+    LUCI_INTERPRETER_CHECK(std::abs(output()->scale() - input()->scale()) <= 1.0e-6);
+    LUCI_INTERPRETER_CHECK(output()->zero_point() == input()->zero_point());
+  }
+  else if (input()->element_type() == DataType::S16) // scale within 1e-6, zero points both 0
+  {
+    LUCI_INTERPRETER_CHECK(std::abs(output()->scale() - input()->scale()) <= 1.0e-6);
+    LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && output()->zero_point() == 0);
+  }
+}
+
+void MaxPool2D::execute() const
+{
+  // Route to the implementation that matches the input's element type.
+  // FLOAT32, U8 and S16 are the only types with an eval routine in this kernel;
+  // every other type is reported with a runtime_error, since configure() does not
+  // restrict the element type beyond requiring input and output to agree.
+  const auto input_type = input()->element_type();
+
+  if (input_type == DataType::FLOAT32)
+    evalFloat();
+  else if (input_type == DataType::U8)
+    evalQuantized();
+  else if (input_type == DataType::S16)
+    evalSInt16();
+  else
+    throw std::runtime_error("Unsupported type.");
+}
+
+void MaxPool2D::evalFloat() const
+{
+  tflite::PoolParams pool{};
+  pool.filter_height = _params.filter_height;
+  pool.filter_width = _params.filter_width;
+  pool.stride_height = _params.stride_height;
+  pool.stride_width = _params.stride_width;
+  pool.padding_values.height = _padding_height;
+  pool.padding_values.width = _padding_width;
+  // Activation range derived from _params.activation.
+  float act_min{};
+  float act_max{};
+  calculateActivationRange(_params.activation, &act_min, &act_max);
+  pool.float_activation_min = act_min;
+  pool.float_activation_max = act_max;
+
+  tflite::reference_ops::MaxPool(pool, getTensorShape(input()), getTensorData<float>(input()),
+                                 getTensorShape(output()), getTensorData<float>(output()));
+}
+
+void MaxPool2D::evalQuantized() const
+{
+  tflite::PoolParams pool{};
+  pool.filter_height = _params.filter_height;
+  pool.filter_width = _params.filter_width;
+  pool.stride_height = _params.stride_height;
+  pool.stride_width = _params.stride_width;
+  pool.padding_values.height = _padding_height;
+  pool.padding_values.width = _padding_width;
+  // Quantized activation range derived from _params.activation and the output tensor.
+  int32_t act_min{};
+  int32_t act_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &act_min, &act_max);
+  pool.quantized_activation_min = act_min;
+  pool.quantized_activation_max = act_max;
+
+  tflite::reference_ops::MaxPool(pool, getTensorShape(input()), getTensorData<uint8_t>(input()),
+                                 getTensorShape(output()), getTensorData<uint8_t>(output()));
+}
+
+void MaxPool2D::evalSInt16() const
+{
+  tflite::PoolParams pool{};
+  pool.filter_height = _params.filter_height;
+  pool.filter_width = _params.filter_width;
+  pool.stride_height = _params.stride_height;
+  pool.stride_width = _params.stride_width;
+  pool.padding_values.height = _padding_height;
+  pool.padding_values.width = _padding_width;
+  // Quantized activation range derived from _params.activation and the output tensor.
+  int32_t act_min{};
+  int32_t act_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &act_min, &act_max);
+  pool.quantized_activation_min = act_min;
+  pool.quantized_activation_max = act_max;
+
+  tflite::reference_integer_ops::MaxPool(
+    pool, getTensorShape(input()), getTensorData<int16_t>(input()), //
+    getTensorShape(output()), getTensorData<int16_t>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/MaxPool2D.h b/compiler/luci-micro/luci-interpreter/src/kernels/MaxPool2D.h
new file mode 100644
index 000000000..bb7666305
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/MaxPool2D.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_MAXPOOL2D_H
+#define LUCI_INTERPRETER_KERNELS_MAXPOOL2D_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class MaxPool2D : public KernelWithParams<Pool2DParams>
+{
+public:
+  MaxPool2D(const Tensor *input, Tensor *output, const Pool2DParams &params);
+
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;     // validates the tensors, sizes the output, computes padding
+  void execute() const override; // dispatches on the input element type
+
+private:
+  void evalFloat() const;     // FLOAT32 path
+  void evalQuantized() const; // U8 path
+  void evalSInt16() const;    // S16 path
+
+private:
+  int32_t _padding_height{}; // set by configure(), passed to TFLite by the eval*() methods
+  int32_t _padding_width{};  // set by configure(), passed to TFLite by the eval*() methods
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_MAXPOOL2D_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/MaxPool2D.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/MaxPool2D.test.cpp
new file mode 100644
index 000000000..44f2a222f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/MaxPool2D.test.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/MaxPool2D.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class MaxPool2DTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(MaxPool2DTest, Float)
+{
+  Shape input_shape{1, 3, 5, 1};
+  std::vector<float> input_data{
+    1,  -1, 0,  -2, 2,  //
+    -7, -6, -5, -4, -3, //
+    5,  4,  3,  6,  7,  //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.filter_height = 2;
+  params.filter_width = 3;
+  params.stride_height = 1;
+  params.stride_width = 2;
+  params.activation = Activation::RELU6;
+
+  MaxPool2D kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{
+    1, 2, //
+    5, 6, //
+  };
+  std::initializer_list<int32_t> ref_output_shape{1, 2, 2, 1};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(MaxPool2DTest, Uint8)
+{
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-15.9375, 15.9375);
+  std::vector<float> input_data{
+    0,  -6, 12, 4, //
+    -3, -2, 10, 7, //
+  };
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.filter_height = 2;
+  params.filter_width = 2;
+  params.stride_height = 2;
+  params.stride_width = 2;
+  params.activation = Activation::RELU6;
+
+  MaxPool2D kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{0.0, 6.0};
+  std::initializer_list<int32_t> ref_output_shape{1, 1, 2, 1};
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(MaxPool2DTest, SInt16)
+{
+  Shape input_shape{1, 3, 5, 1};
+  std::vector<int32_t> ref_output_shape{1, 2, 2, 1};
+  std::vector<float> input_data{
+    1,  -1, 0,  -2, 2,  //
+    -7, -6, -5, -4, -3, //
+    5,  4,  3,  6,  7,  //
+  };
+  std::vector<float> ref_output_data{
+    1, 2, //
+    5, 6, //
+  };
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>(input_shape, 0.2, 0, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.2, 0);
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.filter_height = 2;
+  params.filter_width = 3;
+  params.stride_height = 1;
+  params.stride_width = 2;
+  params.activation = Activation::RELU6;
+
+  MaxPool2D kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Maximum.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Maximum.cpp
new file mode 100644
index 000000000..b102b5e27
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Maximum.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Maximum.h"
+
+#include "kernels/Utils.h"
+
+#include "kernels/BinaryOpCommon.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Maximum::Maximum(const Tensor *input1, const Tensor *input2, Tensor *output)
+  : Kernel({input1, input2}, {output})
+{
+}
+
+void Maximum::configure() // both inputs and the output must share one element type
+{
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type());
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type());
+  output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+void Maximum::execute() const
+{
+  // Route to the evalMaximum instantiation that matches the input element type.
+  // FLOAT32 and U8 are the only types handled by this kernel;
+  // every other type is reported with a runtime_error.
+  const auto input_type = input1()->element_type();
+
+  if (input_type == DataType::FLOAT32)
+    evalMaximum<float>();
+  else if (input_type == DataType::U8)
+    evalMaximum<uint8_t>();
+  else
+    throw std::runtime_error("Unsupported type.");
+}
+
+template <typename T> inline void Maximum::evalMaximum() const
+{
+  const auto pick_larger = [](T lhs, T rhs) { return std::max(lhs, rhs); };
+  BinaryOpBroadcastSlow(getTensorShape(input1()), getTensorData<T>(input1()),
+                        getTensorShape(input2()), getTensorData<T>(input2()),
+                        getTensorShape(output()), getTensorData<T>(output()), pick_larger);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Maximum.h b/compiler/luci-micro/luci-interpreter/src/kernels/Maximum.h
new file mode 100644
index 000000000..3c99e69c7
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Maximum.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_MAXIMUM_H
+#define LUCI_INTERPRETER_KERNELS_MAXIMUM_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Maximum : public Kernel // element-wise maximum of two tensors, with broadcasting
+{
+public:
+  Maximum(const Tensor *input1, const Tensor *input2, Tensor *output);
+  // Accessors for the tensors registered with the Kernel base, in constructor order.
+  const Tensor *input1() const { return _inputs[0]; }
+  const Tensor *input2() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() validates element types and sizes the output; execute() runs the typed kernel.
+  void configure() override;
+  void execute() const override;
+
+private:
+  template <typename T> inline void evalMaximum() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_MAXIMUM_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Maximum.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Maximum.test.cpp
new file mode 100644
index 000000000..e4a505b03
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Maximum.test.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Maximum.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class MaximumTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Created anew in SetUp(); the tests pass it to the tensor factories and allocate through it.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(MaximumTest, Float) // FLOAT32 inputs of identical shape, element-wise maximum
+{
+  Shape input_shape{3, 1, 2};
+  std::vector<float> input_data1{1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
+  std::vector<float> input_data2{-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
+  Tensor input_tensor1 =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data1, _memory_manager.get());
+  Tensor input_tensor2 =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data2, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  // configure() resizes the output tensor, so its memory is allocated only afterwards.
+  Maximum kernel(&input_tensor1, &input_tensor2, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Expected values are the per-element larger of input_data1 and input_data2.
+  std::vector<float> ref_output_data{1.0, 0.0, 1.0, 12.0, -2.0, -1.43};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST_F(MaximumTest, Uint8) // U8 inputs of identical shape, element-wise maximum
+{
+  Shape input_shape{3, 1, 2};
+  std::vector<uint8_t> input_data1{1, 0, 2, 11, 2, 23};
+  std::vector<uint8_t> input_data2{0, 0, 1, 12, 255, 1};
+  Tensor input_tensor1 =
+    makeInputTensor<DataType::U8>(input_shape, input_data1, _memory_manager.get());
+  Tensor input_tensor2 =
+    makeInputTensor<DataType::U8>(input_shape, input_data2, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  Maximum kernel(&input_tensor1, &input_tensor2, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Both inputs share one shape, so the output must keep that {3, 1, 2} shape as well.
+  EXPECT_THAT(extractTensorData<uint8_t>(output_tensor),
+              ::testing::ElementsAreArray({1, 0, 2, 12, 255, 23}));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3, 1, 2}));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Mean.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Mean.cpp
new file mode 100644
index 000000000..8e65e0d6d
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Mean.cpp
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Mean.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/reduce.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Stores the axis list in `params`: the count, each given axis, then 1 in unused slots < 4.
+static void resolveAxes(const int32_t *axes_data, int num_axes, tflite::MeanParams *params)
+{
+  params->axis_count = num_axes;
+
+  // Caller-supplied axes go first, each converted with static_cast<int16>.
+  for (int src = 0; src < num_axes; ++src)
+    params->axis[src] = static_cast<int16>(axes_data[src]);
+  // Any slot from num_axes up to index 3 is filled with 1.
+  for (int pad = num_axes; pad < 4; ++pad)
+    params->axis[pad] = 1;
+}
+
+// Returns how many distinct axes will be reduced; repeated entries are counted once.
+static int getAxisReductionCount(const int32_t *axes_data, int num_axes, int input_num_dims)
+{
+  int reduction_count = num_axes;
+  for (int i = 0; i < num_axes; ++i)
+  {
+    const int current = axes_data[i] + (axes_data[i] < 0 ? input_num_dims : 0);
+    assert(current >= 0 && current < input_num_dims);
+    // Compare against every earlier entry (after the same negative-index wrap); the scan
+    // stops at the first match because one repeat is enough to discount this entry.
+    bool is_repeat = false;
+    for (int j = 0; j < i && !is_repeat; ++j)
+    {
+      const int previous = axes_data[j] + (axes_data[j] < 0 ? input_num_dims : 0);
+      is_repeat = (current == previous);
+    }
+    if (is_repeat)
+      --reduction_count;
+  }
+  return reduction_count;
+}
+
+// Computes the shape produced by reducing `input_shape` over the given axes.
+// An entry of axes_data selects dimension d when it equals d or d - rank (negative index).
+// With keep_dims the rank is preserved and every selected dimension becomes 1; without it
+// the selected dimensions are removed and the remaining ones keep their relative order.
+// A rank-0 input always yields Shape(0).
+static Shape getOutputShape(const Shape &input_shape, const int32_t *axes_data, int num_axes,
+                            bool keep_dims)
+{
+  const int rank = input_shape.num_dims();
+  if (rank == 0)
+  {
+    return Shape(0);
+  }
+
+  // True when `dim` is named by any axes entry, directly or as a negative index.
+  const auto is_reduced = [&](int dim) {
+    for (int k = 0; k < num_axes; ++k)
+    {
+      if (axes_data[k] == dim || axes_data[k] + rank == dim)
+      {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  if (keep_dims)
+  {
+    Shape output_shape(rank);
+    for (int dim = 0; dim < rank; ++dim)
+    {
+      if (is_reduced(dim))
+      {
+        output_shape.dim(dim) = 1;
+      }
+      else
+      {
+        output_shape.dim(dim) = input_shape.dim(dim);
+      }
+    }
+    return output_shape;
+  }
+  else
+  {
+    // The output rank is the input rank minus the number of distinct reduced axes.
+    const int num_reduce_axes = getAxisReductionCount(axes_data, num_axes, rank);
+    Shape output_shape(rank - num_reduce_axes);
+    int next_out = 0; // next free position in output_shape
+    for (int dim = 0; dim < rank; ++dim)
+    {
+      if (!is_reduced(dim))
+      {
+        output_shape.dim(next_out) = input_shape.dim(dim);
+        ++next_out;
+      }
+    }
+    return output_shape;
+  }
+}
+
+Mean::Mean(const Tensor *input, const Tensor *axes, Tensor *output, Tensor *temp_index,
+           Tensor *resolved_axes, Tensor *temp_sum, const ReducerParams &params)
+  : KernelWithParams<ReducerParams>({input, axes}, {output, temp_index, resolved_axes, temp_sum},
+                                    params) // outputs 1..3 are the three temporary tensors
+{
+}
+
+void Mean::configure() // validates the tensors, sizes the output and sets up scratch tensors
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(axes()->element_type() == DataType::S32);
+  if (input()->element_type() == DataType::S16)
+  {
+    LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && output()->zero_point() == 0);
+  }
+
+  const Shape &input_shape = input()->shape();
+  int input_num_dims = input_shape.num_dims();
+
+  const auto *axes_data = getTensorData<int32_t>(axes());
+  int num_axes = axes()->shape().num_elements();
+  // resolveAxes() writes one params.axis slot per axis and only pads up to index 3, so more
+  // than four axes must be rejected in every build type, not just where assert() is active.
+  LUCI_INTERPRETER_CHECK(num_axes <= 4);
+
+  Shape output_shape = getOutputShape(input_shape, axes_data, num_axes, _params.keep_dims);
+  output()->resize(output_shape);
+
+  // The specialized 4D kernel (keep_dims, reduction over axes 1 and 2) needs no scratch
+  // tensors; every other configuration goes through the generic path that does.
+  tflite::MeanParams params{};
+  resolveAxes(axes_data, num_axes, &params);
+  _need_temporaries = !(
+    _params.keep_dims && input_num_dims == 4 && params.axis_count == 2 &&
+    ((params.axis[0] == 1 && params.axis[1] == 2) || (params.axis[0] == 2 && params.axis[1] == 1)));
+  auto temp_index = getOutputTensors()[1];
+  auto resolved_axes = getOutputTensors()[2];
+  auto temp_sum = getOutputTensors()[3];
+  if (_need_temporaries)
+  {
+    temp_index->resize(Shape(input_num_dims));
+    resolved_axes->resize(Shape(num_axes));
+    temp_sum->resize(output()->shape());
+  }
+  else
+  {
+    // Not used on the specialized path, so they are flagged as non-allocatable.
+    temp_index->set_allocatable(false);
+    resolved_axes->set_allocatable(false);
+    temp_sum->set_allocatable(false);
+  }
+}
+
+void Mean::execute() const
+{
+  const auto type = input()->element_type(); // selects the typed implementation below
+  if (type == DataType::FLOAT32)
+  {
+    return evalFloat();
+  }
+  if (type == DataType::U8)
+  {
+    return evalQuantized();
+  }
+  if (type == DataType::S16)
+  {
+    return evalQuantizedS16();
+  }
+  throw std::runtime_error("Unsupported type.");
+}
+
+void Mean::evalFloat() const // FLOAT32 mean: specialized 4D branch or generic reduction
+{
+  const Shape &input_shape = input()->shape();
+  int input_num_dims = input_shape.num_dims();
+  const auto *axes_data = getTensorData<int32_t>(axes());
+  int num_axes = axes()->shape().num_elements();
+  // Rebuild the tflite axis parameters from the axes tensor, as configure() does.
+  tflite::MeanParams params{};
+  resolveAxes(axes_data, num_axes, &params);
+  // Outputs 1..3 are the temporary tensors; only the generic branch below passes them on.
+  auto temp_index = getOutputTensors()[1];
+  auto resolved_axes = getOutputTensors()[2];
+  auto temp_sum = getOutputTensors()[3];
+
+  // Defer to specialized implementation for 4D Mean across axes 1 & 2.
+  if (_params.keep_dims && input_num_dims == 4 && params.axis_count == 2 &&
+      ((params.axis[0] == 1 && params.axis[1] == 2) ||
+       (params.axis[0] == 2 && params.axis[1] == 1)))
+  {
+    tflite::reference_ops::Mean(params, getTensorShape(input()), getTensorData<float>(input()),
+                                getTensorShape(output()), getTensorData<float>(output()));
+  }
+  else // generic overload: takes raw dims, the axes list and the three temporary buffers
+  {
+    tflite::reference_ops::Mean(getTensorData<float>(input()), getTensorShape(input()).DimsData(),
+                                input()->shape().num_dims(), getTensorData<float>(output()),
+                                getTensorShape(output()).DimsData(), output()->shape().num_dims(),
+                                axes_data, num_axes, _params.keep_dims,
+                                getTensorData<int>(temp_index), getTensorData<int>(resolved_axes),
+                                getTensorData<float>(temp_sum));
+  }
+}
+
+void Mean::evalQuantized() const // U8 mean: one of three tflite reference implementations
+{
+  const Shape &input_shape = input()->shape();
+  int input_num_dims = input_shape.num_dims();
+  const auto *axes_data = getTensorData<int32_t>(axes());
+  int num_axes = axes()->shape().num_elements();
+  // Axis parameters, needed for the specialized 4D check below.
+  tflite::MeanParams params{};
+  resolveAxes(axes_data, num_axes, &params);
+  // Temporaries (outputs 1..3). NOTE(review): temp_sum is accessed as int below - confirm size.
+  auto temp_index = getOutputTensors()[1];
+  auto resolved_axes = getOutputTensors()[2];
+  auto temp_sum = getOutputTensors()[3];
+
+  // Defer to specialized implementation for 4D Mean across axes 1 & 2.
+  if (_params.keep_dims && input_num_dims == 4 && params.axis_count == 2 &&
+      ((params.axis[0] == 1 && params.axis[1] == 2) ||
+       (params.axis[0] == 2 && params.axis[1] == 1)))
+  {
+    tflite::reference_ops::Mean(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+                                input()->zero_point(), input()->scale(), getTensorShape(output()),
+                                getTensorData<uint8_t>(output()), output()->zero_point(),
+                                output()->scale());
+  }
+  else if (input()->zero_point() == output()->zero_point() && input()->scale() == output()->scale())
+  { // input and output share zero point and scale: the generic Mean overload is used as-is
+    tflite::reference_ops::Mean(getTensorData<uint8_t>(input()), getTensorShape(input()).DimsData(),
+                                input()->shape().num_dims(), getTensorData<uint8_t>(output()),
+                                getTensorShape(output()).DimsData(), output()->shape().num_dims(),
+                                axes_data, num_axes, _params.keep_dims,
+                                getTensorData<int>(temp_index), getTensorData<int>(resolved_axes),
+                                getTensorData<int>(temp_sum));
+  }
+  else // quantization differs: QuantizedMeanOrSum receives both zero points and both scales
+  {
+    tflite::reference_ops::QuantizedMeanOrSum<>(
+      getTensorData<uint8_t>(input()), input()->zero_point(), input()->scale(),
+      getTensorShape(input()).DimsData(), input()->shape().num_dims(),
+      getTensorData<uint8_t>(output()), output()->zero_point(), output()->scale(),
+      getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes,
+      _params.keep_dims, getTensorData<int>(temp_index), getTensorData<int>(resolved_axes),
+      getTensorData<int>(temp_sum),
+      /*compute_sum=*/false);
+  }
+}
+
+void Mean::evalQuantizedS16() const // S16 mean; only the keep_dims 4D case is implemented
+{
+  const auto *input_data = getTensorData<int16_t>(input());
+  auto *output_data = getTensorData<int16_t>(output());
+
+  const Shape &input_shape = input()->shape();
+  const Shape &output_shape = output()->shape();
+
+  const auto *axes_data = getTensorData<int32_t>(axes());
+  const int num_axes = axes()->shape().num_elements();
+  // Results are clamped to the symmetric range [-max, max] of int16_t.
+  constexpr int32_t output_min = -std::numeric_limits<int16_t>::max();
+  constexpr int32_t output_max = std::numeric_limits<int16_t>::max();
+
+  // Only the keep_dims 4D mean across axes 1 & 2 is handled; anything else throws below.
+  if (_params.keep_dims && input_shape.num_dims() == 4 && num_axes == 2 &&
+      ((axes_data[0] == 1 && axes_data[1] == 2) || (axes_data[0] == 2 && axes_data[1] == 1)))
+  {
+    const int32_t batches = input_shape.dim(0);
+    const int32_t input_height = input_shape.dim(1);
+    const int32_t input_width = input_shape.dim(2);
+    const int32_t depth = input_shape.dim(3);
+    assert(output_shape.num_dims() == 4);
+    assert(output_shape.dim(0) == batches);
+    assert(output_shape.dim(1) == 1);
+    assert(output_shape.dim(2) == 1);
+    assert(output_shape.dim(3) == depth);
+    // Ratio of the input scale to the output scale, computed in double precision.
+    const double real_multiplier =
+      static_cast<double>(input()->scale()) / static_cast<double>(output()->scale());
+    // Presumably splits real_multiplier into a fixed-point multiplier and shift - TODO confirm.
+    int32_t output_multiplier{};
+    int output_shift{};
+    quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
+    // Number of input values that are averaged into each output element.
+    const int32_t num_elements_in_axes = input_height * input_width;
+
+    for (int32_t batch = 0; batch < batches; ++batch)
+    {
+      for (int32_t c = 0; c < depth; ++c)
+      {
+        int32_t acc = 0; // 32-bit sum of the 16-bit inputs over dimensions 1 and 2
+        for (int32_t in_y = 0; in_y < input_height; ++in_y)
+        {
+          for (int32_t in_x = 0; in_x < input_width; ++in_x)
+          {
+            acc += input_data[calcOffset(input_shape, batch, in_y, in_x, c)];
+          }
+        }
+        int32_t scaled_acc =
+          tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+        // Divide by the number of elements rounding to the nearest integer.
+        scaled_acc = scaled_acc > 0
+                       ? (scaled_acc + num_elements_in_axes / 2) / num_elements_in_axes
+                       : (scaled_acc - num_elements_in_axes / 2) / num_elements_in_axes;
+        // Clamp to [output_min, output_max] before the value is stored as int16_t.
+        scaled_acc = std::max(scaled_acc, output_min);
+        scaled_acc = std::min(scaled_acc, output_max);
+
+        output_data[calcOffset(output_shape, batch, 0, 0, c)] = scaled_acc;
+      }
+    }
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported configuration.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Mean.h b/compiler/luci-micro/luci-interpreter/src/kernels/Mean.h
new file mode 100644
index 000000000..ed07ae561
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Mean.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_MEAN_H
+#define LUCI_INTERPRETER_KERNELS_MEAN_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+#include <memory>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Mean : public KernelWithParams<ReducerParams> // mean reduction over the given axes
+{
+public:
+  Mean(const Tensor *input, const Tensor *axes, Tensor *output, Tensor *temp_index,
+       Tensor *resolved_axes, Tensor *temp_sum, const ReducerParams &params);
+  // Accessors for the two inputs and the primary output registered by the constructor.
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *axes() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() validates and sizes tensors; execute() dispatches on the input element type.
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+  void evalQuantizedS16() const;
+
+private:
+  bool _need_temporaries = false; // assigned by configure()
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_MEAN_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Mean.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Mean.test.cpp
new file mode 100644
index 000000000..d2c00935a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Mean.test.cpp
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Mean.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class MeanTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Created anew in SetUp(); the tests pass it to the tensor factories and allocate through it.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(MeanTest, FloatKeepDims)
+{
+  std::vector<float> input_data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                                   9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                                   17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  // Reduce axes 0 and 2 of a {4, 3, 2} tensor; keep_dims leaves them in the shape as 1.
+  std::vector<int32_t> axis_data{0, 2};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({4, 3, 2}, input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({2}, axis_data, _memory_manager.get());
+  Tensor temp_index(DataType::S32, Shape({}), {}, "");
+  Tensor resolved_axes(DataType::S32, Shape({}), {}, "");
+  Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ReducerParams params{};
+  params.keep_dims = true;
+  // temp_index, resolved_axes and temp_sum are temporaries that configure() resizes here.
+  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum,
+              params);
+  kernel.configure();
+  _memory_manager->allocate_memory(temp_index);
+  _memory_manager->allocate_memory(resolved_axes);
+  _memory_manager->allocate_memory(temp_sum);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{10.5, 12.5, 14.5};
+  std::initializer_list<int32_t> ref_output_shape{1, 3, 1};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(MeanTest, FloatKeepDims4DMean)
+{
+  std::vector<float> input_data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                                   9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                                   17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  // keep_dims with a 4D input reduced over axes 1 and 2: the kernel's specialized branch.
+  std::vector<int32_t> axis_data{1, 2};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 2, 3, 2}, input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({2}, axis_data, _memory_manager.get());
+  Tensor temp_index(DataType::S32, Shape({}), {}, "");
+  Tensor resolved_axes(DataType::S32, Shape({}), {}, "");
+  Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ReducerParams params{};
+  params.keep_dims = true;
+  // For this configuration configure() flags the three temporaries as non-allocatable.
+  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum,
+              params);
+  kernel.configure();
+  _memory_manager->allocate_memory(temp_index);
+  _memory_manager->allocate_memory(resolved_axes);
+  _memory_manager->allocate_memory(temp_sum);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{6, 7, 18, 19};
+  std::initializer_list<int32_t> ref_output_shape{2, 1, 1, 2};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(MeanTest, FloatNotKeepDims)
+{
+  std::vector<float> input_data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                                   9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                                   17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  // Axis -3 wraps to 0 for this 3D input, so the list repeats axis 0 and reduces axes 0, 1.
+  std::vector<int32_t> axis_data{1, 0, -3, -3};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({4, 3, 2}, input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({4}, axis_data, _memory_manager.get());
+  Tensor temp_index(DataType::S32, Shape({}), {}, "");
+  Tensor resolved_axes(DataType::S32, Shape({}), {}, "");
+  Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ReducerParams params{};
+  params.keep_dims = false;
+  // Without keep_dims the reduced dimensions are dropped, leaving a shape of {2}.
+  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum,
+              params);
+  kernel.configure();
+  _memory_manager->allocate_memory(temp_index);
+  _memory_manager->allocate_memory(resolved_axes);
+  _memory_manager->allocate_memory(temp_sum);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{12, 13};
+  std::initializer_list<int32_t> ref_output_shape{2};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(MeanTest, Uint8KeepDims)
+{
+  float kQuantizedTolerance = getTolerance(-1.0, 1.0, 255);
+  std::vector<float> input_data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
+  // Input and output share quant_param; axis 1 of a {3, 2} tensor is reduced with keep_dims.
+  std::vector<int32_t> axis_data{1};
+  Tensor input_tensor = makeInputTensor<DataType::U8>({3, 2}, quant_param.first, quant_param.second,
+                                                      input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_data, _memory_manager.get());
+  Tensor temp_index(DataType::S32, Shape({}), {}, "");
+  Tensor resolved_axes(DataType::S32, Shape({}), {}, "");
+  Tensor temp_sum(DataType::S32, Shape({}), {}, ""); // the kernel accesses temp_sum as int
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+
+  ReducerParams params{};
+  params.keep_dims = true;
+
+  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum,
+              params);
+  kernel.configure();
+  _memory_manager->allocate_memory(temp_index);
+  _memory_manager->allocate_memory(resolved_axes);
+  _memory_manager->allocate_memory(temp_sum);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{0.3, 0.35, 0.55};
+  std::initializer_list<int32_t> ref_output_shape{3, 1};
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, kQuantizedTolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(MeanTest, Uint8NotKeepDims)
+{
+  float kQuantizedTolerance = getTolerance(-1.0, 1.0, 255);
+  std::vector<float> input_data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
+  // Input and output share quant_param; axis 1 of a {1, 3, 2} tensor is reduced and dropped.
+  std::vector<int32_t> axis_data{1};
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 3, 2}, quant_param.first, quant_param.second, input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_data, _memory_manager.get());
+  Tensor temp_index(DataType::S32, Shape({}), {}, "");
+  Tensor resolved_axes(DataType::S32, Shape({}), {}, "");
+  Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+  // NOTE(review): the U8 kernel accesses temp_sum as int although it is declared FLOAT32 here.
+  ReducerParams params{};
+  params.keep_dims = false;
+
+  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum,
+              params);
+  kernel.configure();
+  _memory_manager->allocate_memory(temp_index);
+  _memory_manager->allocate_memory(resolved_axes);
+  _memory_manager->allocate_memory(temp_sum);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{0.4, 0.4};
+  std::initializer_list<int32_t> ref_output_shape{1, 2};
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, kQuantizedTolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(MeanTest, SInt16KeepDims4D)
+{
+  std::vector<float> input_data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                                   9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                                   17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  std::vector<int32_t> axes_data{1, 2};
+  std::vector<float> ref_output_data{6, 7, 18, 19};
+  // Input scale 0.25 and output scale 0.2 differ, so the kernel has to rescale the mean.
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({2, 2, 3, 2}, 0.25, 0, input_data, _memory_manager.get());
+  Tensor axes_tensor = makeInputTensor<DataType::S32>({2}, axes_data, _memory_manager.get());
+  Tensor temp_index(DataType::S32, Shape({}), {}, "");
+  Tensor resolved_axes(DataType::S32, Shape({}), {}, "");
+  Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.2, 0);
+
+  ReducerParams params{};
+  params.keep_dims = true;
+  // keep_dims, 4D input and axes {1, 2}: the only configuration the S16 kernel accepts.
+  Mean kernel(&input_tensor, &axes_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum,
+              params);
+  kernel.configure();
+  _memory_manager->allocate_memory(temp_index);
+  _memory_manager->allocate_memory(resolved_axes);
+  _memory_manager->allocate_memory(temp_sum);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 1, 1, 2}));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Minimum.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Minimum.cpp
new file mode 100644
index 000000000..5d3dcde72
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Minimum.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Minimum.h"
+
+#include "kernels/Utils.h"
+
+#include "kernels/BinaryOpCommon.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Minimum::Minimum(const Tensor *input1, const Tensor *input2, Tensor *output)
+  : Kernel({input1, input2}, {output}) // registers two inputs and one output with the base
+{
+}
+
+void Minimum::configure() // checks element types, then sizes the output for broadcasting
+{
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type());
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type());
+  output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+void Minimum::execute() const
+{
+  // Only FLOAT32 and U8 have a typed implementation; any other element type is rejected.
+  const auto type = input1()->element_type();
+  if (type == DataType::FLOAT32)
+  {
+    return evalMinimum<float>();
+  }
+  if (type == DataType::U8)
+  {
+    return evalMinimum<uint8_t>();
+  }
+  throw std::runtime_error("Unsupported type.");
+}
+
+template <typename T> inline void Minimum::evalMinimum() const
+{
+  const auto pick_smaller = [](T lhs, T rhs) { return std::min(lhs, rhs); }; // element-wise op
+  BinaryOpBroadcastSlow(getTensorShape(input1()), getTensorData<T>(input1()),
+                        getTensorShape(input2()), getTensorData<T>(input2()),
+                        getTensorShape(output()), getTensorData<T>(output()), pick_smaller);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Minimum.h b/compiler/luci-micro/luci-interpreter/src/kernels/Minimum.h
new file mode 100644
index 000000000..5ff4035b4
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Minimum.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_MINIMUM_H
+#define LUCI_INTERPRETER_KERNELS_MINIMUM_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Minimum : public Kernel // kernel with two input tensors and one output tensor
+{
+public:
+  Minimum(const Tensor *input1, const Tensor *input2, Tensor *output);
+  // Accessors index into the _inputs / _outputs lists, which are not declared in this class.
+  const Tensor *input1() const { return _inputs[0]; }
+  const Tensor *input2() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() validates element types and resizes the output; execute() runs the operation.
+  void configure() override;
+  void execute() const override;
+
+private:
+  template <typename T> inline void evalMinimum() const; // typed body selected by execute()
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_MINIMUM_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Minimum.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Minimum.test.cpp
new file mode 100644
index 000000000..9a143643f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Minimum.test.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Minimum.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class MinimumTest : public ::testing::Test // fixture owning the memory manager used by the tests
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Assigned in SetUp(); tests pass it to makeInputTensor and use it to allocate the output.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(MinimumTest, Float)
+{
+  // Both operands use the same shape.
+  Shape shape{3, 1, 2};
+  std::vector<float> lhs_values{1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
+  std::vector<float> rhs_values{-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>(shape, lhs_values, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>(shape, rhs_values, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::FLOAT32);
+
+  Minimum kernel(&lhs, &rhs, &result);
+  kernel.configure();
+  _memory_manager->allocate_memory(result);
+  kernel.execute();
+
+  // Expected values are the element-wise smaller of the two operands.
+  std::vector<float> expected{-1.0, 0.0, -1.0, 11.0, -3.0, -1.44};
+  EXPECT_THAT(extractTensorData<float>(result), FloatArrayNear(expected));
+}
+
+TEST_F(MinimumTest, Uint8)
+{
+  Shape input_shape{3, 1, 2};
+  std::vector<uint8_t> input_data1{1, 0, 2, 11, 2, 23};
+  std::vector<uint8_t> input_data2{0, 0, 1, 12, 255, 1};
+  Tensor input_tensor1 =
+    makeInputTensor<DataType::U8>(input_shape, input_data1, _memory_manager.get());
+  Tensor input_tensor2 =
+    makeInputTensor<DataType::U8>(input_shape, input_data2, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  Minimum kernel(&input_tensor1, &input_tensor2, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Both operands share one shape, so the result keeps it; check the values and the shape.
+  EXPECT_THAT(extractTensorData<uint8_t>(output_tensor),
+              ::testing::ElementsAreArray({0, 0, 1, 11, 2, 1}));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3, 1, 2}));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/MirrorPad.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/MirrorPad.cpp
new file mode 100644
index 000000000..bae1eac70
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/MirrorPad.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/MirrorPad.h"
+
+#include "kernels/Utils.h"
+
+#include <limits>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+MirrorPad::MirrorPad(const Tensor *input, const Tensor *paddings, Tensor *output,
+                     const MirrorPadParams &params) // params carries the padding mode
+  : KernelWithParams<MirrorPadParams>({input, paddings}, {output}, params)
+{
+}
+
+void MirrorPad::configure()
+{
+  const Shape &input_shape = input()->shape();
+  const int num_dims = input_shape.num_dims();
+
+  if (num_dims > 4)
+    throw std::runtime_error("Unsupported number of dimensions.");
+  // Validate with throwing checks rather than assert so NDEBUG builds are protected as well.
+  LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
+  LUCI_INTERPRETER_CHECK(paddings()->element_type() == DataType::S32);
+  // Paddings shape should be [N, 2].
+  LUCI_INTERPRETER_CHECK(paddings()->shape().num_dims() == 2);
+  LUCI_INTERPRETER_CHECK(paddings()->shape().dim(0) == num_dims);
+  LUCI_INTERPRETER_CHECK(paddings()->shape().dim(1) == 2);
+
+  Shape output_shape(num_dims);
+  const auto *paddings_data = getTensorData<int32_t>(paddings());
+  for (int i = 0; i < num_dims; ++i)
+  {
+    const int32_t padding_before = paddings_data[i * 2];
+    const int32_t padding_after = paddings_data[i * 2 + 1];
+    LUCI_INTERPRETER_CHECK(padding_before >= 0 && padding_after >= 0);
+    output_shape.dim(i) = input_shape.dim(i) + padding_before + padding_after; // grow each axis
+  }
+
+  output()->resize(output_shape);
+}
+
+template <typename T> // forward declaration; the definition follows MirrorPad::execute()
+inline void MirrorPadImpl(const Tensor &input, const Tensor &paddings, MirrorPadMode mode,
+                          Tensor &output);
+
+void MirrorPad::execute() const
+{
+  // Select the typed implementation from the input's element type.
+  const auto elem_type = input()->element_type();
+  if (elem_type == DataType::FLOAT32)
+  {
+    MirrorPadImpl<float>(*input(), *paddings(), params().mode, *output());
+    return;
+  }
+
+  if (elem_type == DataType::U8)
+  {
+    // The output zero point has to lie within the uint8_t range.
+    assert(output()->zero_point() >= std::numeric_limits<uint8_t>::min());
+    assert(output()->zero_point() <= std::numeric_limits<uint8_t>::max());
+    MirrorPadImpl<uint8_t>(*input(), *paddings(), params().mode, *output());
+    return;
+  }
+
+  throw std::runtime_error("Unsupported type.");
+}
+
+template <typename T>
+inline void MirrorPadImpl(const Tensor &input, const Tensor &paddings, MirrorPadMode mode,
+                          Tensor &output)
+{
+  auto const input_dims = input.shape().num_dims();
+  auto const input_data = input.data<T>();
+  auto const paddings_data = paddings.data<int32_t>();
+  auto const output_data = output.data<T>();
+  // View the tensor as 4-D (b, h, w, d) aligned to its trailing axes; absent axes have size 1.
+  auto const input_b = input_dims > 3 ? input.shape().dim(input_dims - 4) : 1;
+  auto const input_h = input_dims > 2 ? input.shape().dim(input_dims - 3) : 1;
+  auto const input_w = input_dims > 1 ? input.shape().dim(input_dims - 2) : 1;
+  auto const input_d = input.shape().dim(input_dims - 1);
+
+  auto const input_h_offset = input_d * input_w;
+  auto const input_b_offset = input_h_offset * input_h;
+
+  auto const output_b = input_dims > 3 ? output.shape().dim(input_dims - 4) : 1;
+  auto const output_h = input_dims > 2 ? output.shape().dim(input_dims - 3) : 1;
+  auto const output_w = input_dims > 1 ? output.shape().dim(input_dims - 2) : 1;
+  auto const output_d = output.shape().dim(input_dims - 1);
+  // Axes the input lacks have no paddings entry: use 0 instead of reading before the buffer.
+  auto const left_b_pad = input_dims > 3 ? paddings_data[2 * (input_dims - 4)] : 0;
+  auto const left_h_pad = input_dims > 2 ? paddings_data[2 * (input_dims - 3)] : 0;
+  auto const left_w_pad = input_dims > 1 ? paddings_data[2 * (input_dims - 2)] : 0;
+  auto const left_d_pad = paddings_data[2 * (input_dims - 1)];
+
+  auto const right_b_pad = input_dims > 3 ? paddings_data[2 * (input_dims - 4) + 1] : 0;
+  auto const right_h_pad = input_dims > 2 ? paddings_data[2 * (input_dims - 3) + 1] : 0;
+  auto const right_w_pad = input_dims > 1 ? paddings_data[2 * (input_dims - 2) + 1] : 0;
+  auto const right_d_pad = paddings_data[2 * (input_dims - 1) + 1];
+
+  const auto positive_mod = [](auto a, auto b) { return (a % b + b) % b; }; // for b > 0: [0, b)
+  const auto offset_index = [input_d, input_h_offset, input_b_offset](auto d, auto w, auto h,
+                                                                      auto b) {
+    return d + w * input_d + h * input_h_offset + b * input_b_offset;
+  };
+  // Maps an output coordinate to an input coordinate for SYMMETRIC mode.
+  const auto symmetric_dim = [&positive_mod](auto i, auto left_pad, auto input) {
+    bool reflected = (((i < left_pad ? i + 1 - input : i) - left_pad) / input & 1) == 1;
+    return positive_mod(reflected ? input + left_pad - i - 1 : i - left_pad, input);
+  };
+
+  const T *in_ptr = input_data;
+  T *out_ptr = output_data;
+  // Walk the output in memory order; in_ptr advances only on interior (non-padded) elements.
+  for (int32_t b = 0; b < output_b; ++b)
+  {
+    for (int32_t h = 0; h < output_h; ++h)
+    {
+      for (int32_t w = 0; w < output_w; ++w)
+      {
+        for (int32_t d = 0; d < output_d; ++d)
+        {
+          if (b < left_b_pad || b >= output_b - right_b_pad || //
+              h < left_h_pad || h >= output_h - right_h_pad || //
+              w < left_w_pad || w >= output_w - right_w_pad || //
+              d < left_d_pad || d >= output_d - right_d_pad)
+          {
+            if (mode == MirrorPadMode::REFLECT) // NOTE(review): wraps by modulo; confirm intent
+            {
+              *out_ptr++ = input_data[offset_index(
+                positive_mod(d - left_d_pad, input_d), positive_mod(w - left_w_pad, input_w),
+                positive_mod(h - left_h_pad, input_h), positive_mod(b - left_b_pad, input_b))];
+            }
+            else
+            {
+              *out_ptr++ = input_data[offset_index(
+                symmetric_dim(d, left_d_pad, input_d), symmetric_dim(w, left_w_pad, input_w),
+                symmetric_dim(h, left_h_pad, input_h), symmetric_dim(b, left_b_pad, input_b))];
+            }
+          }
+          else
+          {
+            *out_ptr++ = *in_ptr++; // interior element: copy the next input value in order
+          }
+        }
+      }
+    }
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/MirrorPad.h b/compiler/luci-micro/luci-interpreter/src/kernels/MirrorPad.h
new file mode 100644
index 000000000..d3e6e858a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/MirrorPad.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_MIRROR_PAD_H
+#define LUCI_INTERPRETER_KERNELS_MIRROR_PAD_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class MirrorPad : public KernelWithParams<MirrorPadParams> // params hold the padding mode
+{
+public:
+  MirrorPad(const Tensor *input, const Tensor *paddings, Tensor *output,
+            const MirrorPadParams &params);
+  // Accessors index into the _inputs / _outputs lists, which are not declared in this class.
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *paddings() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() computes the padded output shape; execute() fills the output.
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_MIRROR_PAD_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/MirrorPad.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/MirrorPad.test.cpp
new file mode 100644
index 000000000..740d8cb22
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/MirrorPad.test.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/MirrorPad.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class MirrorPadTest : public ::testing::Test // fixture shared by the MirrorPad kernel tests
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Runs the kernel end to end: configure, allocate the output, then execute.
+  void Execute(const Tensor &input, const Tensor &padding, Tensor &output, MirrorPadMode mode)
+  {
+    MirrorPadParams params{};
+    params.mode = mode;
+
+    MirrorPad kernel(&input, &padding, &output, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(output); // output shape is known once configure() has run
+    kernel.execute();
+  }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(MirrorPadTest, FloatReflect)
+{
+  Shape input_shape = {1, 2, 2, 1};
+  Shape padding_shape = {4, 2};
+  // Paddings are {before, after} per axis: axis 1 gets {2, 1}, axis 2 gets {1, 2}.
+  std::vector<float> input_data{1.0f, 2.0f,  //
+                                3.0f, 4.0f}; //
+  std::vector<int> padding_data{0, 0, 2, 1, 1, 2, 0, 0};
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor padding_tensor =
+    makeInputTensor<DataType::S32>(padding_shape, padding_data, _memory_manager.get());
+
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Execute(input_tensor, padding_tensor, output_tensor, MirrorPadMode::REFLECT);
+  // Each padded axis grows from 2 to 2 + 2 + 1 = 5 elements.
+  std::vector<float> ref_output_data{2.0f, 1.0f, 2.0f, 1.0f, 2.0f,  //
+                                     4.0f, 3.0f, 4.0f, 3.0f, 4.0f,  //
+                                     2.0f, 1.0f, 2.0f, 1.0f, 2.0f,  //
+                                     4.0f, 3.0f, 4.0f, 3.0f, 4.0f,  //
+                                     2.0f, 1.0f, 2.0f, 1.0f, 2.0f}; //
+  std::initializer_list<int32_t> ref_output_shape{1, 5, 5, 1};
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(MirrorPadTest, FloatSymmetric)
+{
+  Shape input_shape = {1, 2, 2, 1};
+  Shape padding_shape = {4, 2};
+  // Paddings are {before, after} per axis: axis 1 gets {2, 1}, axis 2 gets {1, 2}.
+  std::vector<float> input_data{1.0f, 2.0f,  //
+                                3.0f, 4.0f}; //
+  std::vector<int> padding_data{0, 0, 2, 1, 1, 2, 0, 0};
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor padding_tensor =
+    makeInputTensor<DataType::S32>(padding_shape, padding_data, _memory_manager.get());
+
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Execute(input_tensor, padding_tensor, output_tensor, MirrorPadMode::SYMMETRIC);
+  // In the reference data the edge elements themselves are repeated (e.g. 3, 3, 4, 4, 3).
+  std::vector<float> ref_output_data{3.0, 3.0, 4.0, 4.0, 3.0,  //
+                                     1.0, 1.0, 2.0, 2.0, 1.0,  //
+                                     1.0, 1.0, 2.0, 2.0, 1.0,  //
+                                     3.0, 3.0, 4.0, 4.0, 3.0,  //
+                                     3.0, 3.0, 4.0, 4.0, 3.0}; //
+  std::initializer_list<int32_t> ref_output_shape{1, 5, 5, 1};
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(MirrorPadTest, FloatSymmetric2Dim)
+{
+  // Rank-2 input: 1 element of padding before and 2 after along axis 0, none along axis 1.
+  std::vector<float> values{1.0f, 2.0f, 3.0f};
+  std::vector<int> pads{1, 2, 0, 0};
+
+  Shape values_shape = {3, 1};
+  Shape pads_shape = {2, 2};
+
+  Tensor values_tensor =
+    makeInputTensor<DataType::FLOAT32>(values_shape, values, _memory_manager.get());
+  Tensor pads_tensor = makeInputTensor<DataType::S32>(pads_shape, pads, _memory_manager.get());
+  Tensor padded = makeOutputTensor(DataType::FLOAT32);
+
+  Execute(values_tensor, pads_tensor, padded, MirrorPadMode::SYMMETRIC);
+
+  // The reference repeats the edge elements: 1 | 1 2 3 | 3 2.
+  std::initializer_list<int32_t> expected_shape{6, 1};
+  std::vector<float> expected_values{1.0, 1.0, 2.0, 3.0, 3.0, 2.0};
+
+  EXPECT_THAT(extractTensorData<float>(padded), FloatArrayNear(expected_values));
+  EXPECT_THAT(extractTensorShape(padded), ::testing::ElementsAreArray(expected_shape));
+}
+
+TEST_F(MirrorPadTest, Uint8Reflect)
+{
+  Shape input_shape = {1, 2, 3, 1};
+  Shape padding_shape = {4, 2};
+  // Tolerance and quantization parameters are both requested for the value range [0, 6].
+  float quant_tolerance = getTolerance(0.0f, 6.0f, 255);
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(0.0f, 6.0f);
+
+  std::vector<float> input_data{1.0f, 2.0f, 3.0f,  //
+                                4.0f, 5.0f, 6.0f}; //
+  std::vector<int> padding_data{0, 0, 2, 1, 1, 3, 0, 0};
+
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    input_shape, quant_param.first, quant_param.second, input_data, _memory_manager.get());
+
+  Tensor padding_tensor =
+    makeInputTensor<DataType::S32>(padding_shape, padding_data, _memory_manager.get());
+
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+
+  Execute(input_tensor, padding_tensor, output_tensor, MirrorPadMode::REFLECT);
+  // NOTE(review): rows repeat 1, 2, 3 periodically rather than mirroring; confirm intended.
+  std::vector<float> ref_output_data{
+    3.0f, 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, //
+    6.0f, 4.0f, 5.0f, 6.0f, 4.0f, 5.0f, 6.0f, //
+    3.0f, 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, //
+    6.0f, 4.0f, 5.0f, 6.0f, 4.0f, 5.0f, 6.0f, //
+    3.0f, 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, //
+  };
+  std::initializer_list<int32_t> ref_output_shape{1, 5, 7, 1};
+
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, quant_tolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(MirrorPadTest, Uint8Symmetric)
+{
+  Shape input_shape = {1, 2, 3, 1};
+  Shape padding_shape = {4, 2};
+  // Tolerance and quantization parameters are both requested for the value range [0, 6].
+  float quant_tolerance = getTolerance(0.0f, 6.0f, 255);
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(0.0f, 6.0f);
+
+  std::vector<float> input_data{1.0f, 2.0f, 3.0f,  //
+                                4.0f, 5.0f, 6.0f}; //
+  std::vector<int> padding_data{0, 0, 2, 1, 1, 3, 0, 0};
+
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    input_shape, quant_param.first, quant_param.second, input_data, _memory_manager.get());
+
+  Tensor padding_tensor =
+    makeInputTensor<DataType::S32>(padding_shape, padding_data, _memory_manager.get());
+
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+
+  Execute(input_tensor, padding_tensor, output_tensor, MirrorPadMode::SYMMETRIC);
+  // In the reference data the edge elements themselves are repeated (e.g. 4, 4, 5, 6, 6, 5, 4).
+  std::vector<float> ref_output_data{
+    4.0f, 4.0f, 5.0f, 6.0f, 6.0f, 5.0f, 4.0f, //
+    1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 2.0f, 1.0f, //
+    1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 2.0f, 1.0f, //
+    4.0f, 4.0f, 5.0f, 6.0f, 6.0f, 5.0f, 4.0f, //
+    4.0f, 4.0f, 5.0f, 6.0f, 6.0f, 5.0f, 4.0f, //
+  };
+  std::initializer_list<int32_t> ref_output_shape{1, 5, 7, 1};
+
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, quant_tolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(MirrorPadTest, UnsupportedDim_NEG)
+{
+  // A rank-5 input exceeds the 4 dimensions accepted by MirrorPad::configure().
+  Tensor rank5_input =
+    makeInputTensor<DataType::FLOAT32>({1, 1, 1, 1, 1}, {1.0f}, _memory_manager.get());
+  Tensor zero_pads =
+    makeInputTensor<DataType::S32>({5, 2}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::FLOAT32);
+  EXPECT_ANY_THROW(Execute(rank5_input, zero_pads, result, MirrorPadMode::REFLECT));
+}
+
+TEST_F(MirrorPadTest, InvalidInputType_NEG)
+{
+  // S64 data is not one of the element types handled by MirrorPad::execute().
+  Tensor s64_input = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
+  Tensor zero_pads = makeInputTensor<DataType::S32>({1, 2}, {0, 0}, _memory_manager.get());
+  Tensor s64_output = makeOutputTensor(DataType::S64);
+  EXPECT_ANY_THROW(Execute(s64_input, zero_pads, s64_output, MirrorPadMode::REFLECT));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Mul.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Mul.cpp
new file mode 100644
index 000000000..531fb4fa1
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Mul.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Mul.h"
+
+#include "kernels/BinaryOpCommon.h"
+#include "kernels/Utils.h"
+
+#include "PALMul.h"
+
+#include <tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Mul::Mul(const Tensor *input1, const Tensor *input2, Tensor *output, const MulParams &params)
+  : KernelWithParams<MulParams>({input1, input2}, {output}, params) // two inputs, one output
+{
+}
+
+void Mul::configure()
+{
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type());
+  LUCI_INTERPRETER_CHECK(output()->element_type() == input1()->element_type());
+  if (input1()->element_type() == DataType::S16) // S16: one zero point per input, all zero
+  {
+    LUCI_INTERPRETER_CHECK(input1()->zero_points().size() == 1 &&
+                           input2()->zero_points().size() == 1);
+    LUCI_INTERPRETER_CHECK(input1()->zero_point() == 0 && input2()->zero_point() == 0 &&
+                           output()->zero_point() == 0);
+  }
+  // The output takes the broadcast of the two operand shapes.
+  output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+void Mul::execute() const
+{
+  // Route to the implementation that matches the element type of the first input:
+  // float, 64-bit and 32-bit integers, and quantized 16-bit each have their own path.
+  const auto elem_type = input1()->element_type();
+
+  if (elem_type == DataType::FLOAT32)
+    evalFloat();
+  else if (elem_type == DataType::S64)
+    evalInteger<int64_t>();
+  else if (elem_type == DataType::S32)
+    evalInteger<int32_t>();
+  else if (elem_type == DataType::S16)
+    evalQuantizedS16();
+  else
+  {
+    // No implementation exists for any other element type.
+    throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Mul::evalFloat() const
+{
+  const auto lhs_shape = getTensorShape(input1());
+  const auto rhs_shape = getTensorShape(input2());
+  const auto out_shape = getTensorShape(output());
+  const float *lhs_data = getTensorData<float>(input1());
+  const float *rhs_data = getTensorData<float>(input2());
+  float *out_data = getTensorData<float>(output());
+  tflite::ArithmeticParams op_params{};
+  fillArithmeticActivationRange<float>(op_params, _params.activation);
+
+  // ProcessBroadcastShapes receives op_params by pointer and reports if broadcasting is needed.
+  if (tflite::reference_ops::ProcessBroadcastShapes(lhs_shape, rhs_shape, &op_params))
+  {
+    luci_interpreter_pal::BroadcastMul4DSlow(op_params, lhs_shape, lhs_data, rhs_shape, rhs_data,
+                                             out_shape, out_data);
+    return;
+  }
+  luci_interpreter_pal::Mul(op_params, lhs_shape, lhs_data, rhs_shape, rhs_data, out_shape,
+                            out_data);
+}
+
+template <typename T> void Mul::evalInteger() const
+{
+  const auto lhs_shape = getTensorShape(input1());
+  const auto rhs_shape = getTensorShape(input2());
+  const auto out_shape = getTensorShape(output());
+  const T *lhs_data = getTensorData<T>(input1());
+  const T *rhs_data = getTensorData<T>(input2());
+  T *out_data = getTensorData<T>(output());
+  tflite::ArithmeticParams op_params{};
+  fillArithmeticActivationRange<T>(op_params, _params.activation);
+
+  // ProcessBroadcastShapes receives op_params by pointer and reports if broadcasting is needed.
+  if (tflite::reference_ops::ProcessBroadcastShapes(lhs_shape, rhs_shape, &op_params))
+  {
+    luci_interpreter_pal::BroadcastMul4DSlow(op_params, lhs_shape, lhs_data, rhs_shape, rhs_data,
+                                             out_shape, out_data);
+    return;
+  }
+  luci_interpreter_pal::Mul(op_params, lhs_shape, lhs_data, rhs_shape, rhs_data, out_shape,
+                            out_data);
+}
+
+void Mul::evalQuantizedS16() const
+{
+  // Rescale factor: (input1 scale * input2 scale) / output scale, computed in double.
+  const double lhs_scale = static_cast<double>(input1()->scale());
+  const double rhs_scale = static_cast<double>(input2()->scale());
+  const double out_scale = static_cast<double>(output()->scale());
+  const double rescale = lhs_scale * rhs_scale / out_scale;
+
+  int32_t multiplier;
+  int shift;
+  quantizeMultiplier(rescale, &multiplier, &shift);
+
+  int32_t act_min{};
+  int32_t act_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &act_min, &act_max);
+
+  // Multiply in 32 bits, apply the quantized multiplier, clamp to the activation range,
+  // then narrow the result to int16_t.
+  auto mul_s16 = [multiplier, shift, act_min, act_max](int16_t lhs, int16_t rhs) {
+    const int32_t product = static_cast<int32_t>(lhs) * static_cast<int32_t>(rhs);
+    const int32_t scaled = tflite::MultiplyByQuantizedMultiplier(product, multiplier, shift);
+    const int32_t clamped = std::min(std::max(scaled, act_min), act_max);
+    return static_cast<int16_t>(clamped);
+  };
+
+  BinaryOpBroadcastSlow(getTensorShape(input1()), getTensorData<int16_t>(input1()),
+                        getTensorShape(input2()), getTensorData<int16_t>(input2()),
+                        getTensorShape(output()), getTensorData<int16_t>(output()), mul_s16);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Mul.h b/compiler/luci-micro/luci-interpreter/src/kernels/Mul.h
new file mode 100644
index 000000000..c0cf817df
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Mul.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_MUL_H
+#define LUCI_INTERPRETER_KERNELS_MUL_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+#include <cstdint>
+#include <vector>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Mul : public KernelWithParams<MulParams> // kernel with two inputs and one output
+{
+public:
+  Mul(const Tensor *input1, const Tensor *input2, Tensor *output, const MulParams &params);
+  // Accessors index into the _inputs / _outputs lists, which are not declared in this class.
+  const Tensor *input1() const { return _inputs[0]; }
+  const Tensor *input2() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() validates types and resizes the output; execute() dispatches on element type.
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;                         // FLOAT32 path
+  template <typename T> void evalInteger() const; // S32 / S64 path
+  void evalQuantizedS16() const;                  // quantized S16 path
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_MUL_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Mul.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Mul.test.cpp
new file mode 100644
index 000000000..fc0e60614
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Mul.test.cpp
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Mul.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Fixture for the Mul tests: SetUp() installs a new TestMemoryManager.
+class MulTest : public ::testing::Test
+{
+protected:
+  std::unique_ptr<IMemoryManager> _memory_manager;
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+};
+
+TEST_F(MulTest, Float)
+{
+  // Reference outputs, one vector per entry of broadcast_shapes.
+  std::vector<std::vector<float>> expected = {
+    {0.00f, 0.69f, 0.12f, 1.15f, 0.00f, 2.07f, 0.18f, 0.15f, 0.00f, 0.25f, 0.90f, 0.45f,
+     0.16f, 0.00f, 0.00f, 0.00f, 0.80f, 0.00f, 0.24f, 0.84f, 0.00f, 1.40f, 1.20f, 2.52f,
+     0.00f, 0.00f, 0.64f, 0.00f, 0.00f, 0.00f, 0.14f, 0.00f, 0.00f, 0.00f, 0.70f, 0.00f},
+    {0.00f, 0.69f, 0.00f, 0.25f, 0.80f, 0.00f, 0.24f, 0.84f, 0.64f, 0.00f, 0.70f, 0.00f},
+    {0.00f, 0.46f, 0.00f, 0.69f, 0.12f, 0.00f, 0.18f, 0.10f, 0.27f, 0.15f, 0.00f, 0.00f,
+     0.16f, 0.00f, 0.24f, 0.00f, 0.00f, 0.44f, 0.60f, 1.40f, 1.20f, 2.80f, 1.08f, 2.52f,
+     0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.35f, 0.00f, 0.70f, 0.00f, 0.63f, 0.00f},
+    {0.00f, 0.46f, 0.27f, 0.15f, 0.00f, 0.44f, 0.60f, 1.40f, 0.00f, 0.00f, 0.63f, 0.00f}};
+  std::vector<Shape> broadcast_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  Shape full_shape = {2, 3, 1, 2};
+  std::vector<float> full_data{-0.3f, 2.3f, 0.9f,  0.5f, 0.8f, -1.1f,
+                               1.2f,  2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
+  std::vector<float> small_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
+  for (size_t n = 0; n < broadcast_shapes.size(); ++n)
+  {
+    MulParams params{};
+    params.activation = Activation::RELU;
+
+    Tensor lhs = makeInputTensor<DataType::FLOAT32>(full_shape, full_data, _memory_manager.get());
+    Tensor rhs =
+      makeInputTensor<DataType::FLOAT32>(broadcast_shapes[n], small_data, _memory_manager.get());
+    Tensor result = makeOutputTensor(DataType::FLOAT32);
+
+    Mul kernel(&lhs, &rhs, &result, params);
+    kernel.configure();
+    // configure() runs before the output buffer is allocated.
+    _memory_manager->allocate_memory(result);
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorData<float>(result), FloatArrayNear(expected[n], 0.0001f))
+      << "With shape number " << n;
+  }
+  // Re-run with exchanged inputs.
+  for (size_t n = 0; n < broadcast_shapes.size(); ++n)
+  {
+    MulParams params{};
+    params.activation = Activation::RELU;
+
+    Tensor lhs =
+      makeInputTensor<DataType::FLOAT32>(broadcast_shapes[n], small_data, _memory_manager.get());
+    Tensor rhs = makeInputTensor<DataType::FLOAT32>(full_shape, full_data, _memory_manager.get());
+    Tensor result = makeOutputTensor(DataType::FLOAT32);
+
+    Mul kernel(&lhs, &rhs, &result, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(result);
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorData<float>(result), FloatArrayNear(expected[n], 0.0001f))
+      << "With shape number " << n;
+  }
+}
+
+template <loco::DataType DType> void checkInteger(luci_interpreter::IMemoryManager *memory_manager)
+{
+  using value_t = typename loco::DataTypeImpl<DType>::Type;
+  // Largest representable value and the largest multiple of ten not exceeding it.
+  const value_t top = std::numeric_limits<value_t>::max();
+  const value_t top_rounded = top - top % 10;
+
+  const Shape wide_shape = {2, 3, 1, 2};
+  const std::vector<Shape> narrow_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  const std::vector<std::vector<value_t>> expected = {
+    {8,  0, 20,  0, 4,  30,  //
+     16, 0, 40,  3, 8,  0,   //
+     0,  0, 0,   6, 0,  0,   //
+     4,  0, 10,  9, 2,  0,   //
+     40, 0, 100, 0, 20, 150, //
+     28, 0, 70,  0, 14, top_rounded},
+    {8, 0, 40, 3, 0, 0, 4, 0, 100, 0, 14, top_rounded},
+    {8,  12,          0, 0, 20, 30, 16, 0, 0, 0,  40, 0,   0,   0, 0, 0,  0,
+     0,  0,           9, 2, 0,  10, 0,  0, 0, 20, 30, 100, 150, 0, 0, 14, top / 10 * 2,
+     70, top_rounded},
+    {8, 12, 0, 0, 0, 0, 0, 9, 20, 30, 70, top_rounded}};
+  const std::vector<value_t> wide_data{2, 3, 4, -1, -3, -2, 1, -3, 10, 15, 7, top / 10};
+  const std::vector<value_t> narrow_data{4, 0, 10, -3, 2, 10};
+  for (size_t n = 0; n < narrow_shapes.size(); ++n)
+  {
+    MulParams params{};
+    params.activation = Activation::RELU;
+
+    Tensor lhs = makeInputTensor<DType>(wide_shape, wide_data, memory_manager);
+    Tensor rhs = makeInputTensor<DType>(narrow_shapes[n], narrow_data, memory_manager);
+    Tensor result = makeOutputTensor(DType);
+
+    Mul kernel(&lhs, &rhs, &result, params);
+    kernel.configure();
+    memory_manager->allocate_memory(result);
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorData<value_t>(result), expected[n])
+      << "With shape number " << n;
+  }
+  // Re-run with exchanged inputs.
+  for (size_t n = 0; n < narrow_shapes.size(); ++n)
+  {
+    MulParams params{};
+    params.activation = Activation::RELU;
+
+    Tensor lhs = makeInputTensor<DType>(narrow_shapes[n], narrow_data, memory_manager);
+    Tensor rhs = makeInputTensor<DType>(wide_shape, wide_data, memory_manager);
+    Tensor result = makeOutputTensor(DType);
+
+    Mul kernel(&lhs, &rhs, &result, params);
+    kernel.configure();
+    memory_manager->allocate_memory(result);
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorData<value_t>(result), expected[n])
+      << "With shape number " << n;
+  }
+}
+
+TEST_F(MulTest, SInt64)
+{
+  checkInteger<loco::DataType::S64>(_memory_manager.get()); // expectations live in the helper
+  SUCCEED();
+}
+
+TEST_F(MulTest, SInt32)
+{
+  checkInteger<loco::DataType::S32>(_memory_manager.get()); // expectations live in the helper
+  SUCCEED();
+}
+
+TEST_F(MulTest, SInt16)
+{
+  // Float reference data shared by both passes below.
+  std::vector<float> full_data{-0.3f, 2.3f, 0.9f,  0.5f, 0.8f, -1.1f,
+                               1.2f,  2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
+  std::vector<float> small_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
+  std::vector<std::vector<float>> expected = {
+    {0.00f, 0.69f, 0.12f, 1.15f, 0.00f, 2.07f, 0.18f, 0.15f, 0.00f, 0.25f, 0.90f, 0.45f,
+     0.16f, 0.00f, 0.00f, 0.00f, 0.80f, 0.00f, 0.24f, 0.84f, 0.00f, 1.40f, 1.20f, 2.52f,
+     0.00f, 0.00f, 0.64f, 0.00f, 0.00f, 0.00f, 0.14f, 0.00f, 0.00f, 0.00f, 0.70f, 0.00f},
+    {0.00f, 0.69f, 0.00f, 0.25f, 0.80f, 0.00f, 0.24f, 0.84f, 0.64f, 0.00f, 0.70f, 0.00f},
+    {0.00f, 0.46f, 0.00f, 0.69f, 0.12f, 0.00f, 0.18f, 0.10f, 0.27f, 0.15f, 0.00f, 0.00f,
+     0.16f, 0.00f, 0.24f, 0.00f, 0.00f, 0.44f, 0.60f, 1.40f, 1.20f, 2.80f, 1.08f, 2.52f,
+     0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.35f, 0.00f, 0.70f, 0.00f, 0.63f, 0.00f},
+    {0.00f, 0.46f, 0.27f, 0.15f, 0.00f, 0.44f, 0.60f, 1.40f, 0.00f, 0.00f, 0.63f, 0.00f}};
+
+  Shape full_shape = {2, 3, 1, 2};
+  std::vector<Shape> broadcast_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  std::vector<std::vector<int32_t>> expected_shapes{
+    {2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
+  for (size_t n = 0; n < broadcast_shapes.size(); ++n)
+  {
+    MulParams params{};
+    params.activation = Activation::RELU;
+
+    Tensor lhs = makeInputTensor<DataType::S16>(full_shape, 3.0 / 32767, 0, full_data,
+                                                _memory_manager.get());
+    Tensor rhs = makeInputTensor<DataType::S16>(broadcast_shapes[n], 1.0 / 32767, 0, small_data,
+                                                _memory_manager.get());
+    Tensor result = makeOutputTensor(DataType::S16, 4.0 / 32767, 0);
+    // Allowed error: twice the output tensor's scale.
+    const float tolerance = result.scale() * 2;
+
+    Mul kernel(&lhs, &rhs, &result, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(result);
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray(expected_shapes[n]))
+      << "With shape number " << n;
+    EXPECT_THAT(dequantizeTensorData(result), FloatArrayNear(expected[n], tolerance))
+      << "With shape number " << n;
+  }
+  // Re-run with exchanged inputs and different scales.
+  for (size_t n = 0; n < broadcast_shapes.size(); ++n)
+  {
+    MulParams params{};
+    params.activation = Activation::RELU;
+
+    Tensor lhs = makeInputTensor<DataType::S16>(broadcast_shapes[n], 2.0 / 32767, 0, small_data,
+                                                _memory_manager.get());
+    Tensor rhs = makeInputTensor<DataType::S16>(full_shape, 4.0 / 32767, 0, full_data,
+                                                _memory_manager.get());
+    Tensor result = makeOutputTensor(DataType::S16, 3.0 / 32767, 0);
+    const float tolerance = result.scale() * 2;
+
+    Mul kernel(&lhs, &rhs, &result, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(result);
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray(expected_shapes[n]))
+      << "With shape number " << n;
+    EXPECT_THAT(dequantizeTensorData(result), FloatArrayNear(expected[n], tolerance))
+      << "With shape number " << n;
+  }
+}
+
+TEST_F(MulTest, Input_Output_Type_NEG)
+{
+  MulParams params{};
+  params.activation = Activation::RELU;
+  // The two operands deliberately disagree on element type: FLOAT32 versus S32.
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::S32>({1}, {2}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::FLOAT32);
+
+  Mul kernel(&lhs, &rhs, &result, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(MulTest, Invalid_Output_Type_NEG)
+{
+  MulParams params{};
+  params.activation = Activation::RELU;
+  // Both inputs are S64 while the output tensor is declared as S32.
+  Tensor lhs = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::S64>({1}, {2}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::S32);
+
+  Mul kernel(&lhs, &rhs, &result, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(MulTest, Invalid_Input_Type_NEG)
+{
+  MulParams params{};
+  params.activation = Activation::RELU;
+  // configure() is called unguarded for U64; the throw is expected only from execute().
+  Tensor lhs = makeInputTensor<DataType::U64>({1}, {1}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::U64>({1}, {2}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::U64);
+
+  Mul kernel(&lhs, &rhs, &result, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(result);
+  EXPECT_ANY_THROW(kernel.execute());
+}
+
+TEST_F(MulTest, Invalid_Quantization_NEG)
+{
+  MulParams params{};
+  params.activation = Activation::NONE;
+  // The S16 tensors are built without the scale arguments used by the SInt16 test.
+  Tensor lhs = makeInputTensor<DataType::S16>({1}, {1}, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::S16>({1}, {2}, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::S16);
+
+  Mul kernel(&lhs, &rhs, &result, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Neg.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Neg.cpp
new file mode 100644
index 000000000..c6fe08a9e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Neg.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Neg.h"
+#include "kernels/Utils.h"
+
+#include "PALNeg.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Neg::Neg(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {} // 1 input, 1 output
+
+void Neg::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  // The output is resized to exactly the shape of the input.
+  output()->resize(input()->shape());
+}
+
+void Neg::execute() const
+{
+  // FLOAT32 is the only element type evaluated here; every other type is rejected.
+  const bool is_float = input()->element_type() == DataType::FLOAT32;
+  if (!is_float)
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+
+  evalFloat();
+}
+
+void Neg::evalFloat() const // float path: delegates to luci_interpreter_pal::Negate
+{
+  luci_interpreter_pal::Negate(getTensorShape(input()), getTensorData<float>(input()),
+                               getTensorShape(output()), getTensorData<float>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Neg.h b/compiler/luci-micro/luci-interpreter/src/kernels/Neg.h
new file mode 100644
index 000000000..69fa1a18e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Neg.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_NEG_H
+#define LUCI_INTERPRETER_KERNELS_NEG_H
+
+#include "core/Kernel.h"
+#include <vector>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Neg kernel declaration: one input tensor, one output tensor.
+class Neg : public Kernel
+{
+public:
+  Neg(const Tensor *input, Tensor *output);
+  Tensor *output() const { return _outputs[0]; }
+  const Tensor *input() const { return _inputs[0]; }
+
+  void execute() const override;
+  void configure() override;
+
+private:
+  void evalFloat() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_NEG_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Neg.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Neg.test.cpp
new file mode 100644
index 000000000..8b2bc1a82
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Neg.test.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Neg.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<T> input_data, std::initializer_list<T> output_data)
+{
+  // Runs Neg once on input_data, then compares the produced values and the produced shape.
+  constexpr DataType kType = getElementType<T>();
+  std::unique_ptr<IMemoryManager> manager = std::make_unique<TestMemoryManager>();
+
+  Tensor source = makeInputTensor<kType>(input_shape, input_data, manager.get());
+  Tensor negated = makeOutputTensor(kType);
+  Neg kernel(&source, &negated);
+
+  kernel.configure();
+  manager->allocate_memory(negated);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<T>(negated), ::testing::ElementsAreArray(output_data));
+  EXPECT_THAT(extractTensorShape(negated), ::testing::ElementsAreArray(output_shape));
+}
+
+TEST(NegTest, FloatSimple)
+{
+  // A 2x3 input holding zero, positive and negative values; each element is expected back
+  // negated, with the shape unchanged.
+  const std::initializer_list<int32_t> shape = {2, 3};
+  const std::initializer_list<float> input = {
+    0.0f, 1.0f, 3.0f,   // Row 1
+    1.0f, -1.0f, -2.0f, // Row 2
+  };
+  const std::initializer_list<float> negated = {
+    0.0f, -1.0f, -3.0f, // Row 1
+    -1.0f, 1.0f, 2.0f,  // Row 2
+  };
+  Check<float>(shape, shape, input, negated);
+
+  SUCCEED();
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/NotEqual.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/NotEqual.cpp
new file mode 100644
index 000000000..54e5eee34
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/NotEqual.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/NotEqual.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/comparisons.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+// Passes x and y to the Kernel base as the input list and output as the only output.
+NotEqual::NotEqual(const Tensor *x, const Tensor *y, Tensor *output) : Kernel({x, y}, {output}) {}
+
+void NotEqual::configure()
+{
+  LUCI_INTERPRETER_CHECK(x()->element_type() == y()->element_type());
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::BOOL);
+  // U8 only: turn each input scale into the multiplier/shift pair read by evalQuantized().
+  if (x()->element_type() == DataType::U8)
+  {
+    quantizeMultiplierSmallerThanOneExp(x()->scale(), &_x_multiplier, &_x_shift);
+    quantizeMultiplierSmallerThanOneExp(y()->scale(), &_y_multiplier, &_y_shift);
+  }
+  output()->resize(calculateShapeForBroadcast(x()->shape(), y()->shape())); // broadcast shape
+}
+
+void NotEqual::execute() const
+{
+  // Pick the evaluation routine from the element type of x; configure() checks that y has
+  // the same element type.
+  switch (x()->element_type())
+  {
+    case DataType::FLOAT32:
+      return evalFloat();
+    case DataType::S64:
+      return evalInteger<int64_t>();
+    case DataType::S32:
+      return evalInteger<int32_t>();
+    case DataType::U8:
+      return evalQuantized();
+    default:
+      break;
+  }
+
+  throw std::runtime_error("Unsupported type.");
+}
+
+void NotEqual::evalFloat() const
+{
+  // Operands of different shape go through the broadcasting reference kernel.
+  tflite::ComparisonParams params;
+  params.is_broadcast = x()->shape() != y()->shape();
+
+  const auto lhs_shape = getTensorShape(x());
+  const auto rhs_shape = getTensorShape(y());
+  const auto out_shape = getTensorShape(output());
+  const auto lhs = getTensorData<float>(x());
+  const auto rhs = getTensorData<float>(y());
+  auto out = getTensorData<bool>(output());
+
+  if (!params.is_broadcast)
+  {
+    tflite::reference_ops::NotEqual(params, lhs_shape, lhs, rhs_shape, rhs, out_shape, out);
+    return;
+  }
+  tflite::reference_ops::Broadcast4DSlowNotEqual(params, lhs_shape, lhs, rhs_shape, rhs, out_shape,
+                                                 out);
+}
+
+template <typename T> void NotEqual::evalInteger() const
+{
+  // Integer element types use the *NoScaling reference kernels, broadcasting when shapes differ.
+  tflite::ComparisonParams params;
+  params.is_broadcast = x()->shape() != y()->shape();
+
+  const auto lhs_shape = getTensorShape(x());
+  const auto rhs_shape = getTensorShape(y());
+  const auto out_shape = getTensorShape(output());
+  const auto lhs = getTensorData<T>(x());
+  const auto rhs = getTensorData<T>(y());
+  auto out = getTensorData<bool>(output());
+
+  if (!params.is_broadcast)
+  {
+    tflite::reference_ops::NotEqualNoScaling(params, lhs_shape, lhs, rhs_shape, rhs, out_shape,
+                                             out);
+    return;
+  }
+  tflite::reference_ops::Broadcast4DSlowNotEqualNoScaling(params, lhs_shape, lhs, rhs_shape, rhs,
+                                                          out_shape, out);
+}
+
+void NotEqual::evalQuantized() const
+{
+  tflite::ComparisonParams params;
+  params.is_broadcast = x()->shape() != y()->shape();
+  params.left_shift = 8;
+  // Offsets are the negated zero points; the multiplier/shift pairs are set in configure().
+  params.input1_offset = -x()->zero_point();
+  params.input1_multiplier = _x_multiplier;
+  params.input1_shift = _x_shift;
+  params.input2_offset = -y()->zero_point();
+  params.input2_multiplier = _y_multiplier;
+  params.input2_shift = _y_shift;
+
+  const auto lhs_shape = getTensorShape(x());
+  const auto rhs_shape = getTensorShape(y());
+  const auto out_shape = getTensorShape(output());
+  const auto lhs = getTensorData<uint8_t>(x());
+  const auto rhs = getTensorData<uint8_t>(y());
+  auto out = getTensorData<bool>(output());
+
+  if (!params.is_broadcast)
+  {
+    tflite::reference_ops::NotEqualWithScaling(params, lhs_shape, lhs, rhs_shape, rhs, out_shape,
+                                               out);
+    return;
+  }
+  tflite::reference_ops::Broadcast4DSlowNotEqualWithScaling(params, lhs_shape, lhs, rhs_shape, rhs,
+                                                            out_shape, out);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/NotEqual.h b/compiler/luci-micro/luci-interpreter/src/kernels/NotEqual.h
new file mode 100644
index 000000000..d2aafe893
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/NotEqual.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_NOT_EQUAL_H
+#define LUCI_INTERPRETER_KERNELS_NOT_EQUAL_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// NotEqual kernel declaration: two input tensors (x, y) and one output tensor.
+class NotEqual : public Kernel
+{
+public:
+  NotEqual(const Tensor *x, const Tensor *y, Tensor *output);
+  Tensor *output() const { return _outputs[0]; }
+  const Tensor *x() const { return _inputs[0]; }
+  const Tensor *y() const { return _inputs[1]; }
+
+  void execute() const override;
+  void configure() override;
+
+private:
+  void evalQuantized() const;
+  template <typename T> void evalInteger() const;
+  void evalFloat() const;
+
+  // Multiplier/shift pairs for x and y; all four members start out as zero.
+  int32_t _x_multiplier = 0;
+  int _x_shift = 0;
+  int32_t _y_multiplier = 0;
+  int _y_shift = 0;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_NOT_EQUAL_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/NotEqual.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/NotEqual.test.cpp
new file mode 100644
index 000000000..45bf4022a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/NotEqual.test.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/NotEqual.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Fixture for the NotEqual tests: SetUp() installs a new TestMemoryManager.
+class NotEqualTest : public ::testing::Test
+{
+protected:
+  std::unique_ptr<IMemoryManager> _memory_manager;
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+};
+
+TEST_F(NotEqualTest, FloatSimple)
+{
+  // Both operands have shape {2, 3}; they differ everywhere except in the middle column.
+  std::vector<float> lhs_values{
+    0.5, 0.7, 0.9, // Row 1
+    1,   0,   -1,  // Row 2
+  };
+  std::vector<float> rhs_values{
+    0.9, 0.7, 0.5, // Row 1
+    -1,  0,   1,   // Row 2
+  };
+
+  std::vector<bool> expected{
+    true, false, true, // Row 1
+    true, false, true, // Row 2
+  };
+
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({2, 3}, lhs_values, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>({2, 3}, rhs_values, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&lhs, &rhs, &result);
+  kernel.configure();
+  _memory_manager->allocate_memory(result);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({2, 3}));
+}
+
+TEST_F(NotEqualTest, FloatBroardcast)
+{
+  std::vector<float> lhs_values{
+    0.5, 0.7, 0.9, // Row 1
+    1,   0,   -1,  // Row 2
+    -1,  0,   1,   // Row 3
+    0.9, 0.7, 0.5, // Row 4
+  };
+  // The {1, 3} operand is compared against each of the four rows of the {4, 3} operand.
+  std::vector<float> rhs_values{
+    0.9, 0.7, 0.5, // Row 1
+  };
+
+  std::vector<bool> expected{
+    true,  false, true,  // Row 1
+    true,  true,  true,  // Row 2
+    true,  true,  true,  // Row 3
+    false, false, false, // Row 4
+  };
+
+  Tensor lhs = makeInputTensor<DataType::FLOAT32>({4, 3}, lhs_values, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::FLOAT32>({1, 3}, rhs_values, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&lhs, &rhs, &result);
+  kernel.configure();
+  _memory_manager->allocate_memory(result);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({4, 3}));
+}
+
+template <loco::DataType DType>
+void checkIntegerSimple(luci_interpreter::IMemoryManager *memory_manager)
+{
+  using value_t = typename loco::DataTypeImpl<DType>::Type;
+  const value_t lowest = std::numeric_limits<value_t>::min();
+  const value_t highest = std::numeric_limits<value_t>::max();
+
+  // Only the middle pair (2 versus -2) differs; both extremes are compared with themselves.
+  std::vector<value_t> lhs_values{lowest, 2, highest};
+  std::vector<value_t> rhs_values{lowest, -2, highest};
+  std::vector<bool> expected{false, true, false};
+
+  Tensor lhs = makeInputTensor<DType>({3}, lhs_values, memory_manager);
+  Tensor rhs = makeInputTensor<DType>({3}, rhs_values, memory_manager);
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&lhs, &rhs, &result);
+  kernel.configure();
+  memory_manager->allocate_memory(result);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({3}));
+}
+
+template <loco::DataType DType>
+void checkIntegerBroadcast(luci_interpreter::IMemoryManager *memory_manager)
+{
+  using value_t = typename loco::DataTypeImpl<DType>::Type;
+  const value_t lowest = std::numeric_limits<value_t>::min();
+  const value_t highest = std::numeric_limits<value_t>::max();
+  // The {3} operand is compared against each of the four rows of the {4, 3} operand.
+  std::vector<value_t> lhs_values{
+    lowest, 2,  3,       // Row 1
+    4,      5,  highest, // Row 2
+    -1,     -2, -3,      // Row 3
+    lowest, -2, highest, // Row 4
+  };
+  std::vector<value_t> rhs_values{
+    lowest, -2, highest, // Row 1
+  };
+
+  std::vector<bool> expected{
+    false, true,  true,  // Row 1
+    true,  true,  false, // Row 2
+    true,  false, true,  // Row 3
+    false, false, false, // Row 4
+  };
+
+  Tensor lhs = makeInputTensor<DType>({4, 3}, lhs_values, memory_manager);
+  Tensor rhs = makeInputTensor<DType>({3}, rhs_values, memory_manager);
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&lhs, &rhs, &result);
+  kernel.configure();
+  memory_manager->allocate_memory(result);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({4, 3}));
+}
+
+TEST_F(NotEqualTest, Int32)
+{
+  checkIntegerSimple<loco::DataType::S32>(_memory_manager.get());    // {3} against {3}
+  checkIntegerBroadcast<loco::DataType::S32>(_memory_manager.get()); // {4, 3} against {3}
+  SUCCEED();
+}
+
+TEST_F(NotEqualTest, Int64)
+{
+  checkIntegerSimple<loco::DataType::S64>(_memory_manager.get());    // {3} against {3}
+  checkIntegerBroadcast<loco::DataType::S64>(_memory_manager.get()); // {4, 3} against {3}
+  SUCCEED();
+}
+
+// Range [-1, 127/128]: 256 evenly spaced values 1/128 apart, chosen to avoid rounding errors.
+const float F_MIN = -128.0 / 128.0;
+const float F_MAX = 127.0 / 128.0;
+
+TEST_F(NotEqualTest, Uint8Quantized)
+{
+  std::vector<float> lhs_values{
+    0.5, 0.5, 0.7,  0.9, // Row 1
+    1,   0,   0.05, -1,  // Row 2
+  };
+
+  std::vector<float> rhs_values{
+    0.9, 0.5, 0.55, 0.5, // Row 1
+    -1,  0,   0.05, 1,   // Row 2
+  };
+
+  std::vector<bool> expected{
+    true, false, true,  true, // Row 1
+    true, false, false, true, // Row 2
+  };
+
+  // The operands use different quantization ranges: [F_MIN, F_MAX] for x and twice that
+  // range for y.
+  std::pair<float, int32_t> lhs_quant = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  std::pair<float, int32_t> rhs_quant = quantizationParams<uint8_t>(F_MIN * 2, F_MAX * 2);
+  Tensor lhs = makeInputTensor<DataType::U8>({1, 2, 4, 1}, lhs_quant.first, lhs_quant.second,
+                                             lhs_values, _memory_manager.get());
+  Tensor rhs = makeInputTensor<DataType::U8>({1, 2, 4, 1}, rhs_quant.first, rhs_quant.second,
+                                             rhs_values, _memory_manager.get());
+  Tensor result = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&lhs, &rhs, &result);
+  kernel.configure();
+  _memory_manager->allocate_memory(result);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(result), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(result), ::testing::ElementsAreArray(expected));
+}
+
+TEST_F(NotEqualTest, Uint8QuantizedBroadcast) // y {1, 1, 4, 1} is compared with x {1, 4, 4, 1}
+{
+  std::vector<float> x_data{
+    0.4,  -0.8, 0.7,  0.3, // Row 1
+    -0.5, 0.1,  0,    0.5, // Row 2
+    1,    0,    0.05, -1,  // Row 3
+    -1,   0.05, 0,    1,   // Row 4
+  };
+
+  std::vector<float> y_data{
+    -1, 0.05, 0, 1, // Row 1
+  };
+
+  std::vector<bool> ref_output_data{
+    true,  true,  true,  true,  // Row 1
+    true,  true,  false, true,  // Row 2
+    true,  true,  true,  true,  // Row 3
+    false, false, false, false, // Row 4 (this row of x holds the same values as y)
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 4, 4, 1}, quant_param.first, quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 1, 4, 1}, quant_param.first, quant_param.second, y_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor); // allocated only after configure()
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST_F(NotEqualTest, Input_Type_Mismatch_NEG) // FLOAT32 x with U8 y
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure()); // the test passes only if configure() throws
+}
+
+TEST_F(NotEqualTest, Input_Output_Type_NEG) // FLOAT32 inputs with a FLOAT32 (non-BOOL) output
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure()); // the test passes only if configure() throws
+}
+
+TEST_F(NotEqualTest, Float_Broadcast_NEG) // FLOAT32 shapes {2} and {3}
+{
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2}, {1.f, 2.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({3}, {1.f, 2.f, 3.f}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  ASSERT_ANY_THROW(kernel.configure()); // the test passes only if configure() throws
+}
+
+TEST_F(NotEqualTest, Int32_Broadcast_NEG) // S32 shapes {2} and {3}
+{
+  Tensor x_tensor = makeInputTensor<DataType::S32>({2}, {1, 2}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::S32>({3}, {1, 2, 3}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  ASSERT_ANY_THROW(kernel.configure()); // the test passes only if configure() throws
+}
+
+TEST_F(NotEqualTest, Int64_Broadcast_NEG) // S64 shapes {2} and {3}
+{
+  Tensor x_tensor = makeInputTensor<DataType::S64>({2}, {1, 2}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::S64>({3}, {1, 2, 3}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+
+  NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
+  ASSERT_ANY_THROW(kernel.configure()); // the test passes only if configure() throws
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/OneHot.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/OneHot.cpp
new file mode 100644
index 000000000..4d3e5f2ef
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/OneHot.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/OneHot.h"
+#include "kernels/Utils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+namespace
+{
+
+// Fills output_tensor with the one-hot encoding of indices_tensor along axis. Indices outside
+// [0, depth) match no position, so every element of their slice receives the off value.
+template <typename T>
+void OneHotComputeImpl(const Tensor *indices_tensor, const Tensor *on_value_tensor,
+                       const Tensor *off_value_tensor, int32_t depth, int32_t axis,
+                       Tensor *output_tensor)
+{
+  // define input shape and correct axis (-1 places the new axis after all input dimensions)
+  auto const &input_shape = indices_tensor->shape();
+  axis = axis == -1 ? input_shape.num_dims() : axis;
+
+  // TODO support other integer input types
+  auto const *indices = getTensorData<int32_t>(indices_tensor);
+  auto const on_value = getTensorData<T>(on_value_tensor)[0];
+  auto const off_value = getTensorData<T>(off_value_tensor)[0];
+  auto *output = getTensorData<T>(output_tensor);
+
+  // prefix_dim_size == # of elements before the axis
+  // depth == # of elements per axis
+  // suffix_dim_size == # of elements after the axis
+  auto prefix_dim_size = 1;
+  for (int32_t i = 0; i < axis; ++i)
+    prefix_dim_size *= input_shape.dim(i);
+  // Degenerate (zero-sized) indices: nothing to write, and the division below must not see 0.
+  if (prefix_dim_size == 0)
+    return;
+  auto const suffix_dim_size = input_shape.num_elements() / prefix_dim_size;
+
+  // View the indices as a matrix of size: prefix_dim_size x suffix_dim_size
+  // View the output as a matrix of size: prefix_dim_size x depth x suffix_dim_size
+  // Then the output is:
+  //     output(i, j, k) == (indices(i, k) == j) ? on : off
+  for (int32_t i = 0; i < prefix_dim_size; ++i)
+    for (int32_t j = 0; j < depth; ++j)
+      for (int32_t k = 0; k < suffix_dim_size; ++k, ++output)
+        *output = indices[i * suffix_dim_size + k] == j ? on_value : off_value;
+}
+
+} // namespace
+
+OneHot::OneHot(const Tensor *indices, const Tensor *depth, const Tensor *on_value,
+               const Tensor *off_value, Tensor *output, const OneHotParams &params)
+  : KernelWithParams<OneHotParams>({indices, depth, on_value, off_value}, {output}, params)
+{
+  // Nothing to do here: the tensors and params are only forwarded to the base class
+}
+
+// Validates the operands and resizes output to the indices shape with depth inserted at axis.
+void OneHot::configure()
+{
+  // check types
+  LUCI_INTERPRETER_CHECK(indices()->element_type() == DataType::S32);
+  LUCI_INTERPRETER_CHECK(depth()->element_type() == DataType::S32);
+  LUCI_INTERPRETER_CHECK(on_value()->element_type() == off_value()->element_type());
+  LUCI_INTERPRETER_CHECK(output()->element_type() == on_value()->element_type());
+
+  // check shape dependent parameters
+  LUCI_INTERPRETER_CHECK(on_value()->shape().num_elements() == 1);
+  LUCI_INTERPRETER_CHECK(off_value()->shape().num_elements() == 1);
+  LUCI_INTERPRETER_CHECK(depth()->shape().num_elements() == 1);
+  LUCI_INTERPRETER_CHECK(params().axis >= -1 && params().axis <= indices()->shape().num_dims());
+
+  // define parameters that affect the output shape; depth is an extent, so it must be >= 0
+  auto const depth_value = getTensorData<int32_t>(depth())[0];
+  LUCI_INTERPRETER_CHECK(depth_value >= 0);
+  auto const &input_shape = indices()->shape();
+  auto const input_dims = input_shape.num_dims();
+  auto const axis = params().axis == -1 ? input_dims : params().axis;
+
+  // define output shape: input dims before axis, then depth, then the remaining input dims
+  Shape output_shape(input_dims + 1);
+  {
+    for (int32_t d = 0; d < axis; ++d)
+      output_shape.dim(d) = input_shape.dim(d);
+    output_shape.dim(axis) = depth_value;
+    for (int32_t d = axis + 1; d < output_shape.num_dims(); ++d)
+      output_shape.dim(d) = input_shape.dim(d - 1);
+  }
+
+  // reshape output
+  output()->resize(output_shape);
+}
+
+void OneHot::execute() const // picks the OneHotComputeImpl instantiation from the output type
+{
+  auto const depth_value = getTensorData<int32_t>(depth())[0];
+  auto const axis = params().axis; // may still be -1; OneHotComputeImpl resolves it
+
+  switch (output()->element_type())
+  {
+    case loco::DataType::FLOAT32:
+      OneHotComputeImpl<float>(indices(), on_value(), off_value(), depth_value, axis, output());
+      break;
+    case loco::DataType::U8:
+      OneHotComputeImpl<uint8_t>(indices(), on_value(), off_value(), depth_value, axis, output());
+      break;
+    case loco::DataType::S16:
+      OneHotComputeImpl<int16_t>(indices(), on_value(), off_value(), depth_value, axis, output());
+      break;
+    default:
+      // TODO Support other data types
+      throw std::runtime_error("Not supported, yet!");
+      break; // never reached: the throw above leaves the function
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/OneHot.h b/compiler/luci-micro/luci-interpreter/src/kernels/OneHot.h
new file mode 100644
index 000000000..572f857ae
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/OneHot.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_ONEHOT_H
+#define LUCI_INTERPRETER_KERNELS_ONEHOT_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class OneHot : public KernelWithParams<OneHotParams> // one-hot kernel; params carry the axis
+{
+public:
+  OneHot(const Tensor *indices, const Tensor *depth, const Tensor *on_value,
+         const Tensor *off_value, Tensor *output, const OneHotParams &params);
+
+  const Tensor *indices() const { return _inputs[0]; }   // input 0
+  const Tensor *depth() const { return _inputs[1]; }     // input 1
+  const Tensor *on_value() const { return _inputs[2]; }  // input 2
+  const Tensor *off_value() const { return _inputs[3]; } // input 3
+
+  Tensor *output() const { return _outputs[0]; } // the only output
+
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_ONEHOT_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/OneHot.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/OneHot.test.cpp
new file mode 100644
index 000000000..45b6968fa
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/OneHot.test.cpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/OneHot.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T1, typename T2> // T1: indices element type, T2: on/off/output element type
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<T1> input_data, std::initializer_list<int32_t> depth_data,
+           std::initializer_list<T2> on_value_data, std::initializer_list<T2> off_value_data,
+           int32_t axis, std::initializer_list<T2> output_data)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  constexpr auto input_type = getElementType<T1>();
+  constexpr auto output_type = getElementType<T2>();
+  // depth, on_value and off_value are created with an empty shape list
+  Tensor input_tensor = makeInputTensor<input_type>(input_shape, input_data, memory_manager.get());
+  Tensor depth_tensor = makeInputTensor<DataType::S32>({}, depth_data, memory_manager.get());
+  Tensor on_value_tensor = makeInputTensor<output_type>({}, on_value_data, memory_manager.get());
+  Tensor off_value_tensor = makeInputTensor<output_type>({}, off_value_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(output_type);
+
+  OneHotParams params{};
+  params.axis = axis;
+
+  OneHot kernel(&input_tensor, &depth_tensor, &on_value_tensor, &off_value_tensor, &output_tensor,
+                params);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor); // allocated only after configure()
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), output_shape);
+  EXPECT_THAT(extractTensorData<T2>(output_tensor), ::testing::ElementsAreArray(output_data));
+}
+
+template <typename T> class OneHotTest : public ::testing::Test
+{
+};
+// Element types the typed tests are instantiated with (used as on/off/output type in Check)
+using DataTypes = ::testing::Types<float, uint8_t, int16_t>;
+TYPED_TEST_SUITE(OneHotTest, DataTypes);
+
+TYPED_TEST(OneHotTest, BasicPattern) // indices 5 and 7 are >= depth 4: their slices stay all-off
+{
+  // axis 0: depth becomes the leading output dimension
+  Check<int32_t, TypeParam>(/*input_shape=*/{2, 3}, /*output_shape=*/{4, 2, 3},
+                            /*input_data=*/
+                            {
+                              0, 3, 5, //
+                              7, 3, 0, //
+                            },
+                            /*depth_data=*/{4}, /*on_value_data=*/{1}, /*off_value_data=*/{0},
+                            /*axis=*/0,
+                            /*output_data=*/
+                            {
+                              1, 0, 0, //
+                              0, 0, 1, //
+
+                              0, 0, 0, //
+                              0, 0, 0, //
+
+                              0, 0, 0, //
+                              0, 0, 0, //
+
+                              0, 1, 0, //
+                              0, 1, 0, //
+                            });
+  // axis 1: depth is inserted between the two input dimensions
+  Check<int32_t, TypeParam>(/*input_shape=*/{2, 3}, /*output_shape=*/{2, 4, 3},
+                            /*input_data=*/
+                            {
+                              0, 3, 5, //
+                              7, 3, 0, //
+                            },
+                            /*depth_data=*/{4}, /*on_value_data=*/{1}, /*off_value_data=*/{0},
+                            /*axis=*/1,
+                            /*output_data=*/
+                            {
+                              1, 0, 0, //
+                              0, 0, 0, //
+                              0, 0, 0, //
+                              0, 1, 0, //
+
+                              0, 0, 1, //
+                              0, 0, 0, //
+                              0, 0, 0, //
+                              0, 1, 0, //
+                            });
+  // axis -1: depth is appended as the last output dimension
+  Check<int32_t, TypeParam>(/*input_shape=*/{2, 3}, /*output_shape=*/{2, 3, 4},
+                            /*input_data=*/
+                            {
+                              0, 3, 5, //
+                              7, 3, 0, //
+                            },
+                            /*depth_data=*/{4}, /*on_value_data=*/{1}, /*off_value_data=*/{0},
+                            /*axis=*/-1,
+                            /*output_data=*/
+                            {
+                              1, 0, 0, 0, //
+                              0, 0, 0, 1, //
+                              0, 0, 0, 0, //
+
+                              0, 0, 0, 0, //
+                              0, 0, 0, 1, //
+                              1, 0, 0, 0, //
+                            });
+}
+
+TEST(OneHotTest, UnsupportedInputType_NEG)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  // indices must be S32 (checked in OneHot::configure()); FLOAT32 indices have to be rejected
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {0}, memory_manager.get());
+
+  Tensor depth_tensor = makeInputTensor<DataType::S32>({}, {1}, memory_manager.get());
+  Tensor on_value_tensor = makeInputTensor<DataType::FLOAT32>({}, {1.0}, memory_manager.get());
+  Tensor off_value_tensor = makeInputTensor<DataType::FLOAT32>({}, {0.0}, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  OneHotParams params = {-1};
+
+  OneHot kernel(&input_tensor, &depth_tensor, &on_value_tensor, &off_value_tensor, &output_tensor,
+                params);
+  EXPECT_ANY_THROW(kernel.configure()); // the test passes only if configure() throws
+}
+
+TEST(OneHotTest, OutputTypeMismatch_NEG)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::S32>({1}, {0}, memory_manager.get());
+  Tensor depth_tensor = makeInputTensor<DataType::S32>({}, {1}, memory_manager.get());
+
+  // on_value, off_value and output must share one element type; here output is S16, not FLOAT32
+  Tensor on_value_tensor = makeInputTensor<DataType::FLOAT32>({}, {1.0}, memory_manager.get());
+  Tensor off_value_tensor = makeInputTensor<DataType::FLOAT32>({}, {0.0}, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S16);
+
+  OneHotParams params = {-1};
+
+  OneHot kernel(&input_tensor, &depth_tensor, &on_value_tensor, &off_value_tensor, &output_tensor,
+                params);
+  EXPECT_ANY_THROW(kernel.configure()); // the test passes only if configure() throws
+}
+
+TEST(OneHotTest, InvalidAxis_NEG)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::S32>({1}, {0}, memory_manager.get());
+  Tensor depth_tensor = makeInputTensor<DataType::S32>({}, {1}, memory_manager.get());
+  Tensor on_value_tensor = makeInputTensor<DataType::FLOAT32>({}, {1.0}, memory_manager.get());
+  Tensor off_value_tensor = makeInputTensor<DataType::FLOAT32>({}, {0.0}, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  // axis must lie in [-1, rank of indices]; -2 is below that range
+  OneHotParams params = {-2};
+
+  OneHot kernel(&input_tensor, &depth_tensor, &on_value_tensor, &off_value_tensor, &output_tensor,
+                params);
+  EXPECT_ANY_THROW(kernel.configure()); // the test passes only if configure() throws
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/PRelu.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/PRelu.cpp
new file mode 100644
index 000000000..5a6b05c3a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/PRelu.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/PRelu.h"
+
+#include "kernels/BinaryOpCommon.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/binary_function.h>
+#include <tensorflow/lite/kernels/internal/reference/prelu.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+PRelu::PRelu(const Tensor *input, const Tensor *alpha, Tensor *output)
+  : Kernel({input, alpha}, {output}) // inputs: {input, alpha}; outputs: {output}
+{
+}
+
+PRelu::~PRelu()
+{
+  // Out of line on purpose: PRelu.h only forward-declares the _alpha_multipliers element type
+}
+
+void PRelu::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(alpha()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(input()->scales().size() <= 1);
+  LUCI_INTERPRETER_CHECK(output()->scales().size() <= 1);
+  // U8: alpha has at most one scale; S16: alpha has one scale per entry of its last dimension
+  if (input()->element_type() == DataType::U8)
+  {
+    LUCI_INTERPRETER_CHECK(alpha()->scales().size() <= 1); // remove when CWQ kernel arrives
+    _alpha_multipliers.resize(1);
+    double alpha_multiplier = input()->scale() * alpha()->scale() / output()->scale();
+    quantizeMultiplier(alpha_multiplier, &_alpha_multipliers[0].multiplier,
+                       &_alpha_multipliers[0].shift);
+    double identity_multiplier = input()->scale() / output()->scale();
+    quantizeMultiplier(identity_multiplier, &_output_multiplier_identity, &_output_shift_identity);
+  }
+  else if (input()->element_type() == DataType::S16)
+  {
+    // Common check for correctness of quant params: every zero point involved must be 0
+    LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && output()->zero_point() == 0);
+    for (size_t channel = 0; channel < alpha()->zero_points().size(); ++channel)
+    {
+      LUCI_INTERPRETER_CHECK(alpha()->zero_points()[channel] == 0);
+    }
+    // PRelu specific checks for CWQ: alpha is quantized along its last dimension
+    LUCI_INTERPRETER_CHECK(alpha()->quantized_dimension() == alpha()->shape().num_dims() - 1);
+    LUCI_INTERPRETER_CHECK(static_cast<int32_t>(alpha()->scales().size()) ==
+                           alpha()->shape().dim(alpha()->quantized_dimension()));
+    LUCI_INTERPRETER_CHECK(alpha()->shape().num_elements() ==
+                           input()->shape().dim(input()->shape().num_dims() - 1));
+
+    // all dimensions of alpha except the last one must have size 1
+    for (int dim = 0; dim < alpha()->shape().num_dims() - 1; ++dim)
+    {
+      LUCI_INTERPRETER_CHECK(alpha()->shape().dim(dim) == 1);
+    }
+
+    std::vector<double> real_multipliers =
+      getQuantizedConvolutionMultiplers(input()->scale(), alpha()->scales(), output()->scale());
+    // evalQuantizedS16() later indexes _alpha_multipliers by last-dimension channel
+    _alpha_multipliers = quantizeMultipliers(real_multipliers);
+
+    double identity_multiplier = input()->scale() / output()->scale();
+    quantizeMultiplier(identity_multiplier, &_output_multiplier_identity, &_output_shift_identity);
+  }
+  output()->resize(calculateShapeForBroadcast(input()->shape(), alpha()->shape()));
+}
+
+void PRelu::execute() const // dispatches on the input element type
+{
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    case DataType::U8:
+      evalQuantized();
+      break;
+    case DataType::S16:
+      evalQuantizedS16();
+      break;
+    default: // any other element type is rejected at execution time
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+// Float PRelu: out = in when in >= 0, otherwise in * alpha. One functor serves both the
+// broadcasting path and the same-shape path, so the two formulas cannot drift apart.
+void PRelu::evalFloat() const
+{
+  const auto input_data = getTensorData<float>(input());
+  const auto alpha_data = getTensorData<float>(alpha());
+  const auto size = getTensorShape(input()).FlatSize();
+  auto output_data = getTensorData<float>(output());
+
+  auto PReluFunc = [](float input, float alpha) { return input >= 0.0 ? input : input * alpha; };
+
+  if (input()->shape() != alpha()->shape())
+  {
+    // Shapes differ: the reference helper pairs up input and alpha elements by broadcasting.
+    tflite::reference_ops::BroadcastBinaryFunction4DSlow<float, float, float>(
+      getTensorShape(input()), input_data, getTensorShape(alpha()), alpha_data,
+      getTensorShape(output()), output_data, PReluFunc);
+  }
+  else
+  {
+    // Same shape: apply the functor element by element over the flat buffers.
+    for (auto i = decltype(size){0}; i < size; ++i)
+    {
+      output_data[i] = PReluFunc(input_data[i], alpha_data[i]);
+    }
+  }
+}
+
+void PRelu::evalQuantized() const // U8 path, built on the tflite reference kernels
+{
+  tflite::PreluParams op_params{};
+  // *_1 receives the identity multiplier/shift, *_2 the alpha ones; both come from configure()
+  op_params.input_offset = -input()->zero_point(); // Note the '-'.
+  op_params.alpha_offset = -alpha()->zero_point(); // Note the '-'.
+  op_params.output_offset = output()->zero_point();
+  op_params.output_shift_1 = _output_shift_identity;
+  op_params.output_multiplier_1 = _output_multiplier_identity;
+  op_params.output_shift_2 = _alpha_multipliers[0].shift;
+  op_params.output_multiplier_2 = _alpha_multipliers[0].multiplier;
+  // differing shapes go through the 4D broadcasting variant, equal shapes through plain Prelu
+  if (input()->shape() != alpha()->shape())
+  {
+    tflite::reference_ops::BroadcastPrelu4DSlow(
+      op_params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(alpha()),
+      getTensorData<uint8_t>(alpha()), getTensorShape(output()), getTensorData<uint8_t>(output()));
+  }
+  else
+  {
+    tflite::reference_ops::Prelu<uint8_t>(
+      op_params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(alpha()),
+      getTensorData<uint8_t>(alpha()), getTensorShape(output()), getTensorData<uint8_t>(output()));
+  }
+}
+
+static inline int16_t evalElemS16PRelu(int16_t input_val, int16_t alpha_val,
+                                       const ChannelQuantMultipliers &identity_mult,
+                                       const ChannelQuantMultipliers &alpha_mult)
+{
+  constexpr int32_t quantized_min = std::numeric_limits<int16_t>::min();
+  constexpr int32_t quantized_max = std::numeric_limits<int16_t>::max();
+  // input >= 0: rescale input with identity_mult; input < 0: rescale input * alpha with alpha_mult
+  const int32_t output_val =
+    input_val >= 0
+      ? tflite::MultiplyByQuantizedMultiplier(static_cast<int32_t>(input_val),
+                                              identity_mult.multiplier, identity_mult.shift)
+      : tflite::MultiplyByQuantizedMultiplier(static_cast<int32_t>(input_val * alpha_val),
+                                              alpha_mult.multiplier, alpha_mult.shift);
+  const int32_t clamped_output = std::min(quantized_max, std::max(quantized_min, output_val));
+  return clamped_output; // within the int16_t range after the clamp above
+}
+
+void PRelu::evalQuantizedS16() const
+{
+  // Note that this kernel assumes alpha is CWQ
+  tflite::RuntimeShape input_shape = getTensorShape(input());
+  const int16_t *input_data = input()->data<int16_t>();
+  const int16_t *alpha_data = alpha()->data<int16_t>();
+  int16_t *output_data = output()->data<int16_t>();
+  // NOTE(review): positional init assumes the member order is {shift, multiplier} - confirm
+  const ChannelQuantMultipliers pos_mult{_output_shift_identity, _output_multiplier_identity};
+
+  const int last_dim = input()->shape().num_dims() - 1;
+  // The input is walked as [outer_dims_size, quant_dim_size]; alpha is indexed by channel only
+  int32_t outer_dims_size = 1;
+  for (int i = 0; i < last_dim; ++i)
+    outer_dims_size *= input_shape.Dims(i);
+  int32_t quant_dim_size = input_shape.Dims(last_dim);
+
+  for (int32_t outer_dims = 0; outer_dims < outer_dims_size; ++outer_dims)
+    for (int32_t quant_channel = 0; quant_channel < quant_dim_size; ++quant_channel)
+    {
+      const ChannelQuantMultipliers &neg_mult = _alpha_multipliers[quant_channel];
+      size_t offset = static_cast<size_t>(outer_dims) * static_cast<size_t>(quant_dim_size);
+      offset += quant_channel; // flat index of element (outer_dims, quant_channel)
+
+      output_data[offset] =
+        evalElemS16PRelu(input_data[offset], alpha_data[quant_channel], pos_mult, neg_mult);
+    }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/PRelu.h b/compiler/luci-micro/luci-interpreter/src/kernels/PRelu.h
new file mode 100644
index 000000000..f7735d418
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/PRelu.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_PRELU_H
+#define LUCI_INTERPRETER_KERNELS_PRELU_H
+
+#include "core/Kernel.h"
+#include <vector>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class ChannelQuantMultipliers; // forward declaration only; element type of _alpha_multipliers
+
+class PRelu : public Kernel // inputs: input and alpha; single output
+{
+public:
+  PRelu(const Tensor *input, const Tensor *alpha, Tensor *output);
+  // Declared here without a body because ChannelQuantMultipliers is incomplete in this header
+  ~PRelu();
+
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *alpha() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;        // FLOAT32 path
+  void evalQuantized() const;    // U8 path
+  void evalQuantizedS16() const; // S16 path
+
+private:
+  std::vector<ChannelQuantMultipliers> _alpha_multipliers; // set by configure() for U8 and S16
+  // TODO merge this into one ChannelQuantMultiplier object
+  int32_t _output_multiplier_identity = 0; // quantized input scale / output scale (configure())
+  int _output_shift_identity = 0;          // shift belonging to the multiplier above
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_PRELU_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/PRelu.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/PRelu.test.cpp
new file mode 100644
index 000000000..6d97382de
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/PRelu.test.cpp
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/PRelu.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> alpha_shape,
+           std::initializer_list<int32_t> output_shape, std::initializer_list<T> input_data,
+           std::initializer_list<T> alpha_data, std::initializer_list<T> output_data)
+{
+  // Shared driver: runs PRelu on tensors of element type T, then compares output data and shape.
+  constexpr DataType dtype = getElementType<T>();
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  IMemoryManager *const mm = memory_manager.get();
+  Tensor input_tensor = makeInputTensor<dtype>(input_shape, input_data, mm);
+  Tensor alpha_tensor = makeInputTensor<dtype>(alpha_shape, alpha_data, mm);
+  Tensor output_tensor = makeOutputTensor(dtype);
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  kernel.configure();
+  // Allocate the output only after configure() (presumably it sets the output shape).
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+TEST(PReluTest, FloatSimple) // alpha has the input's shape; negative inputs are scaled by alpha
+{
+  Check<float>(/*input_shape=*/{2, 3}, /*alpha_shape=*/{2, 3},
+               /*output_shape=*/{2, 3},
+               /*input_data=*/
+               {
+                 0.0f, 1.0f, 3.0f,   // Row 1
+                 1.0f, -1.0f, -2.0f, // Row 2
+               },
+               /*alpha_data=*/
+               {
+                 0.0f, 0.5f, 0.1f, // Row 1
+                 0.0f, 0.5f, 0.1f, // Row 2
+               },
+               /*output_data=*/
+               {
+                 0.0f, 1.0f, 3.0f,   // Row 1
+                 1.0f, -0.5f, -0.2f, // Row 2
+               });
+
+  SUCCEED();
+}
+
+TEST(PReluTest, FloatBroadcast) // a {1, 1, 3} alpha is broadcast against a {1, 2, 2, 3} input
+{
+  Check<float>(/*input_shape=*/{1, 2, 2, 3}, /*alpha_shape=*/{1, 1, 3},
+               /*output_shape=*/{1, 2, 2, 3},
+               /*input_data=*/
+               {
+                 0.0f, 0.0f, 0.0f,    // Row 1, Column 1
+                 1.0f, 1.0f, 1.0f,    // Row 1, Column 2
+                 -1.0f, -1.0f, -1.0f, // Row 2, Column 1
+                 -2.0f, -2.0f, -2.0f, // Row 2, Column 2
+               },
+               /*alpha_data=*/
+               {0.0f, 1.0f, 2.0f},
+               /*output_data=*/
+               {
+                 0.0f, 0.0f, 0.0f,   // Row 1, Column 1
+                 1.0f, 1.0f, 1.0f,   // Row 1, Column 2
+                 0.0f, -1.0f, -2.0f, // Row 2, Column 1
+                 0.0f, -2.0f, -4.0f, // Row 2, Column 2
+               });
+
+  SUCCEED();
+}
+
+float GetTolerance(float min, float max) { return (max - min) / 255.0; } // range / 255 steps
+
+TEST(PReluTest, Uint8Simple) // U8 tensors sharing one scale/zero point derived for [-1, 1]
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> input_data{-0.8f, 0.2f, 0.9f, 0.7f, 0.1f, -0.4f};
+  std::vector<float> alpha_data{0.5f, 0.5f, 0.5f, 0.25f, 1.0f, 0.25f};
+  std::vector<float> ref_output_data{-0.4f, 0.2f, 0.9f, 0.7f, 0.1f, -0.1f};
+
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
+
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 3, 1}, quant_param.first, quant_param.second, input_data, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 3, 1}, quant_param.first, quant_param.second, alpha_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, kQuantizedTolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 3, 1}));
+
+  SUCCEED();
+}
+
+TEST(PReluTest, Uint8Broadcast) // U8 with a {1, 1, 3} alpha; also pins the raw quantized output
+{
+  std::vector<float> input_data{
+    0.0f,   0.0f,   0.0f,   // Row 1, Column 1
+    0.5f,   0.5f,   0.5f,   // Row 1, Column 2
+    -1.0f,  -1.0f,  -1.0f,  // Row 2, Column 1
+    -0.25f, -0.25f, -0.25f, // Row 2, Column 2
+  };
+  std::vector<float> alpha_data{0.0f, 0.5f, -0.5f};
+  std::vector<float> ref_output_data{
+    0.0f, 0.0f,    0.0f,  // Row 1, Column 1
+    0.5f, 0.5f,    0.5f,  // Row 1, Column 2
+    0.0f, -0.5f,   0.5f,  // Row 2, Column 1
+    0.0f, -0.125f, 0.125f // Row 2, Column 2
+  };
+  std::vector<float> ref_quant_output_data{
+    128, 128, 128, // Row 1, Column 1
+    192, 192, 192, // Row 1, Column 2
+    128, 64,  192, // Row 2, Column 1
+    128, 112, 144  // Row 2, Column 2
+  };
+  float kQuantizedTolerance = 2 * (1. / 256);
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(kMin, kMax);
+
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 2, 3}, quant_param.first, quant_param.second, input_data, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::U8>(
+    {1, 1, 3}, quant_param.first, quant_param.second, alpha_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, kQuantizedTolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2, 3}));
+  EXPECT_THAT(extractTensorData<uint8_t>(output_tensor),
+              ::testing::ElementsAreArray(ref_quant_output_data));
+}
+
+TEST(PReluTest, SInt16_LWQ_NEG) // S16 alpha with a single scale: configure() must throw
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  // Rewrite this test in case layer-wise quantization for sint16 is supported
+  std::vector<float> input_data(6); // data is not important
+  std::vector<float> alpha_data(6);
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 2, 3, 1}, 0.1, 0, input_data, memory_manager.get());
+  Tensor alpha_tensor =
+    makeInputTensor<DataType::S16>({1, 2, 3, 1}, 0.1, 0, alpha_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.1, 0);
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(PReluTest, SInt16_CWQ_Simple) // S16 with one alpha scale per element of a {2} alpha
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> input_data{-0.8f, 0.2f, 0.9f, -0.7f, 0.1f, -0.4f};
+  std::vector<float> alpha_data{0.5f, 0.25f};
+  std::vector<float> ref_output_data{-0.4f, 0.2f, 0.9f, -0.175f, 0.1f, -0.1f};
+
+  std::vector<float> alpha_scales{0.05f, 0.025f};
+  std::vector<int32_t> zerop{0, 0};
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data, memory_manager.get());
+  Tensor alpha_tensor =
+    makeInputTensor<DataType::S16>({2}, alpha_scales, zerop, 0, alpha_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.025, 0);
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 3, 2}));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(PReluTest, SInt16_CWQ_spatial_alpha_NEG) // {1, 1, 3, 2} S16 alpha: configure() must throw
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> input_data(6); // data is not important
+  std::vector<float> alpha_data(6);
+
+  std::vector<float> alpha_scales{0.25f, 0.05f};
+  std::vector<int32_t> zerop{0, 0};
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::S16>({1, 1, 3, 2}, alpha_scales, zerop, 3,
+                                                       alpha_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.1, 0);
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(PReluTest, SInt16_CWQ_wrong_dim_quant_NEG) // alpha quantized along dim 1: must throw
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> input_data(6); // data is not important
+  std::vector<float> alpha_data(6);
+
+  std::vector<float> alpha_scales{0.25f};
+  std::vector<int32_t> zerop{0};
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::S16>({1, 1, 1, 2}, alpha_scales, zerop, 1,
+                                                       alpha_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.1, 0);
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(PReluTest, SInt16_CWQ_uneven_shape1) // rank-3 S16 alpha {1, 1, 2} against a rank-4 input
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> input_data{-0.8f, 0.2f, 0.9f, -0.7f, 0.1f, -0.4f};
+  std::vector<float> alpha_data{0.5f, 0.25f};
+  std::vector<float> ref_output_data{-0.4f, 0.2f, 0.9f, -0.175f, 0.1f, -0.1f};
+
+  std::vector<float> alpha_scales{0.05f, 0.025f};
+  std::vector<int32_t> zerop{0, 0};
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::S16>({1, 1, 2}, alpha_scales, zerop, 2,
+                                                       alpha_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.025, 0);
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 3, 2}));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(PReluTest, SInt16_CWQ_uneven_shape2) // S16 alpha {1, 1, 1, 3} against a {1, 2, 2, 3} input
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> input_data{
+    0.0f,   0.0f,   0.0f,   // Row 1, Column 1
+    0.5f,   0.5f,   0.5f,   // Row 1, Column 2
+    -1.0f,  -1.0f,  -1.0f,  // Row 2, Column 1
+    -0.25f, -0.25f, -0.25f, // Row 2, Column 2
+  };
+  std::vector<float> alpha_data{0.0f, 0.5f, -0.5f};
+  std::vector<float> ref_output_data{
+    0.0f, 0.0f,    0.0f,  // Row 1, Column 1
+    0.5f, 0.5f,    0.5f,  // Row 1, Column 2
+    0.0f, -0.5f,   0.5f,  // Row 2, Column 1
+    0.0f, -0.125f, 0.125f // Row 2, Column 2
+  };
+
+  std::vector<float> alpha_scales{1.f, 0.05f, 0.1f};
+  std::vector<int32_t> zerop{0, 0, 0};
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 2, 2, 3}, 0.01, 0, input_data, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::S16>({1, 1, 1, 3}, alpha_scales, zerop, 3,
+                                                       alpha_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.001, 0);
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2, 3}));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(PReluTest, Input_Output_Type_NEG) // FLOAT32 inputs with a U8 output: configure() must throw
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(PReluTest, Input_Alpha_Type_NEG) // FLOAT32 input with a U8 alpha: configure() must throw
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::U8>({1}, {1}, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(PReluTest, Invalid_Input_Type_NEG) // S64 gets through configure(); execute() must throw
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1}, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::S64>({1}, {1}, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S64);
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  EXPECT_ANY_THROW(kernel.execute());
+}
+
+TEST(PReluTest, Input_Output_U8_CWQ_NEG) // U8 tensors with two scales each: must throw
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> scales{1.f, 1.f};
+  std::vector<int32_t> zerop{0, 0};
+  std::vector<float> dummy_data(4, 0.f);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>({2, 2}, scales, zerop, 0, dummy_data, memory_manager.get());
+  Tensor alpha_tensor =
+    makeInputTensor<DataType::U8>({2, 2}, scales, zerop, 0, dummy_data, memory_manager.get());
+  Tensor output_tensor =
+    makeInputTensor<DataType::U8>({2, 2}, scales, zerop, 0, dummy_data, memory_manager.get());
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(PReluTest, Input_Output_S16_CWQ_NEG) // S16 input/output with two scales each: must throw
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> scales{1.f, 1.f};
+  std::vector<int32_t> zerop{0, 0};
+  std::vector<float> dummy_data(4, 0.f);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({2, 2}, scales, zerop, 0, dummy_data, memory_manager.get());
+  Tensor alpha_tensor =
+    makeInputTensor<DataType::S16>({2, 2}, scales, zerop, 0, dummy_data, memory_manager.get());
+  Tensor output_tensor =
+    makeInputTensor<DataType::S16>({2, 2}, scales, zerop, 0, dummy_data, memory_manager.get());
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(PReluTest, Mixing_U8_S16_NEG) // U8 input/output with an S16 alpha: configure() must throw
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> dummy_data(4, 0.f);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>({2, 2}, 1.f, 0, dummy_data, memory_manager.get());
+  Tensor alpha_tensor =
+    makeInputTensor<DataType::S16>({2, 2}, 1.f, 0, dummy_data, memory_manager.get());
+  Tensor output_tensor =
+    makeInputTensor<DataType::U8>({2, 2}, 1.f, 0, dummy_data, memory_manager.get());
+
+  PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Pack.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Pack.cpp
new file mode 100644
index 000000000..42aab330c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Pack.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Pack.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Pack::Pack(std::vector<const Tensor *> inputs, Tensor *output, const PackParams &params)
+  : KernelWithParams<PackParams>(std::move(inputs), {output}, params)
+{ // nothing else to set up: every argument is forwarded to the KernelWithParams base
+}
+
+void Pack::configure()
+{
+  // Validates the operands and derives the output shape: the shape of the inputs with one
+  // extra dimension of size values_count inserted at `axis`. Failed checks throw.
+  // At least one input is required: _inputs[0] is read unconditionally below.
+  LUCI_INTERPRETER_CHECK(!_inputs.empty());
+  LUCI_INTERPRETER_CHECK(_inputs.size() == static_cast<uint32_t>(params().values_count));
+  const Tensor *t0 = _inputs[0];
+  const int dimension_size = t0->shape().num_dims() + 1;
+  int axis = params().axis;
+  if (axis < 0)
+    axis += dimension_size; // a negative axis counts from the end of the output shape
+  LUCI_INTERPRETER_CHECK(axis >= 0 && axis <= t0->shape().num_dims());
+
+  if (t0->element_type() != DataType::S32 && t0->element_type() != DataType::FLOAT32 &&
+      t0->element_type() != DataType::U8 && t0->element_type() != DataType::S8 &&
+      t0->element_type() != DataType::S16 && t0->element_type() != DataType::S64)
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+  // execute() writes the output using the first input's element type, so the types must agree.
+  LUCI_INTERPRETER_CHECK(output()->element_type() == t0->element_type());
+
+  for (uint32_t i = 1; i < _inputs.size(); ++i)
+  {
+    const Tensor *tensor = _inputs[i];
+    LUCI_INTERPRETER_CHECK(tensor->element_type() == t0->element_type());
+    LUCI_INTERPRETER_CHECK(tensor->shape().num_dims() == t0->shape().num_dims());
+    for (int d = 0; d < t0->shape().num_dims(); ++d)
+    {
+      LUCI_INTERPRETER_CHECK(tensor->shape().dim(d) == t0->shape().dim(d));
+    }
+  }
+
+  Shape output_shape(dimension_size);
+  int input_dim = 0;
+  for (int index = 0; index < dimension_size; ++index)
+  {
+    if (index == axis)
+      output_shape.dim(index) = params().values_count;
+    else
+      output_shape.dim(index) = t0->shape().dim(input_dim++);
+  }
+
+  if (t0->element_type() == DataType::U8 || t0->element_type() == DataType::S8 ||
+      t0->element_type() == DataType::S16)
+  {
+    // Quantization params of the output and of every input must match, because
+    // requantizing while packing is not supported.
+    LUCI_INTERPRETER_CHECK(output()->zero_point() == t0->zero_point());
+    LUCI_INTERPRETER_CHECK(output()->scale() == t0->scale());
+    for (const Tensor *tensor : _inputs)
+    {
+      LUCI_INTERPRETER_CHECK(tensor->zero_point() == t0->zero_point());
+      LUCI_INTERPRETER_CHECK(tensor->scale() == t0->scale());
+    }
+  }
+
+  output()->resize(output_shape);
+}
+
+void Pack::execute() const
+{
+  // Dispatch on the element type of the first input; configure() checks that the
+  // remaining inputs use the same type.
+  const auto element_type = _inputs[0]->element_type();
+
+  switch (element_type)
+  {
+    case DataType::FLOAT32:
+      return evalGeneric<float>();
+    case DataType::U8:
+      return evalGeneric<uint8_t>();
+    case DataType::S8:
+      return evalGeneric<int8_t>();
+    case DataType::S16:
+      return evalGeneric<int16_t>();
+    case DataType::S32:
+      return evalGeneric<int32_t>();
+    case DataType::S64:
+      return evalGeneric<int64_t>();
+    default:
+      break;
+  }
+  // Only reached for element types outside the list above.
+  throw std::runtime_error("Unsupported type.");
+}
+
+template <typename T> void Pack::evalGeneric() const
+{
+  // Packs all inputs into the output with the TFLite reference kernel, using element type T.
+  // A negative axis is counted from the end of the output rank (input rank + 1).
+  const int output_rank = _inputs[0]->shape().num_dims() + 1;
+  const int requested_axis = params().axis;
+  const int axis = (requested_axis < 0) ? requested_axis + output_rank : requested_axis;
+
+  tflite::PackParams op_params{};
+  op_params.axis = axis;
+  op_params.inputs_count = _inputs.size();
+
+  // Supplies the shapes()/data() arrays of all inputs that the reference kernel consumes.
+  VectorOfTensors<T, true> all_inputs(_inputs);
+  tflite::reference_ops::Pack<T>(op_params, all_inputs.shapes(), all_inputs.data(),
+                                 getTensorShape(output()), getTensorData<T>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Pack.h b/compiler/luci-micro/luci-interpreter/src/kernels/Pack.h
new file mode 100644
index 000000000..4a2fcfd80
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Pack.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_PACK_H
+#define LUCI_INTERPRETER_KERNELS_PACK_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Pack : public KernelWithParams<PackParams>
+{
+public:
+  Pack(std::vector<const Tensor *> inputs, Tensor *output, const PackParams &params);
+  // input(index) returns _inputs[index] as-is (no range check here); output() is _outputs[0].
+  const Tensor *input(int index) const { return _inputs[index]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  template <typename T> void evalGeneric() const; // typed implementation called by execute()
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_PACK_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Pack.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Pack.test.cpp
new file mode 100644
index 000000000..d16320b78
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Pack.test.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Pack.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::vector<std::initializer_list<int32_t>> input_shapes,
+           std::initializer_list<int32_t> output_shape, std::vector<std::vector<T>> input_datas,
+           std::initializer_list<T> output_data, int32_t axis)
+{
+  // Shared driver: packs input_datas along `axis` and compares the output data and shape.
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  constexpr DataType element_type = getElementType<T>();
+  const size_t num_inputs = input_datas.size();
+  std::vector<const Tensor *> inputs(num_inputs);
+  std::vector<Tensor> tmp_inputs;
+  tmp_inputs.reserve(num_inputs);
+  for (size_t i = 0; i < num_inputs; i++)
+  {
+    if (std::is_same<T, float>::value || std::is_same<T, int32_t>::value ||
+        std::is_same<T, int64_t>::value)
+    {
+      tmp_inputs.push_back(Tensor(element_type, input_shapes[i], {}, ""));
+    }
+    else if (std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)
+    {
+      tmp_inputs.push_back(Tensor(element_type, input_shapes[i], {{1.0f / 255}, {128}}, ""));
+    }
+    else
+    {
+      assert((std::is_same<T, int16_t>::value) && "unexpected dtype is tested");
+      tmp_inputs.push_back(Tensor(element_type, input_shapes[i], {{1.0f}, {0}}, ""));
+    }
+    memory_manager->allocate_memory(tmp_inputs[i]);
+    tmp_inputs[i].writeData(input_datas[i].data(), input_datas[i].size() * sizeof(T));
+  }
+  // Addresses are taken only once tmp_inputs has stopped growing.
+  for (size_t i = 0; i < num_inputs; i++)
+  {
+    inputs[i] = &tmp_inputs[i];
+  }
+
+  Tensor output_tensor = makeOutputTensor(element_type);
+  if (std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)
+  {
+    output_tensor = makeOutputTensor(element_type, 1.0f / 255, 128);
+  }
+  else if (std::is_same<T, int16_t>::value)
+  {
+    output_tensor = makeOutputTensor(element_type, 1.0f, 0);
+  }
+
+  PackParams params{};
+  params.axis = axis;
+  params.values_count = input_datas.size();
+  Pack kernel(inputs, &output_tensor, params);
+
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+template <typename T> class PackTest : public ::testing::Test
+{
+};
+// Element types for which each TYPED_TEST(PackTest, ...) case is instantiated.
+using DataTypes = ::testing::Types<uint8_t, int8_t, int16_t, int32_t, int64_t, float>;
+TYPED_TEST_SUITE(PackTest, DataTypes);
+
+TYPED_TEST(PackTest, ThreeInputs) // three {2} inputs packed along axis 0 give a {3, 2} output
+{
+  Check<TypeParam>(/*input_shapes=*/{{2}, {2}, {2}},
+                   /*output_shape=*/{3, 2},
+                   /*input_datas=*/
+                   {{1, 4}, {2, 5}, {3, 6}},
+                   /*output_data=*/
+                   {1, 4, 2, 5, 3, 6}, /*axis=*/0);
+
+  SUCCEED();
+}
+
+TYPED_TEST(PackTest, NegAxis) // axis -1 on three {2} inputs gives a {2, 3} output
+{
+  Check<TypeParam>(/*input_shapes=*/{{2}, {2}, {2}},
+                   /*output_shape=*/{2, 3},
+                   /*input_datas=*/
+                   {{1, 4}, {2, 5}, {3, 6}},
+                   /*output_data=*/
+                   {1, 2, 3, 4, 5, 6}, /*axis=*/-1);
+
+  SUCCEED();
+}
+
+TEST(Pack, MismatchingInputValuesCount_NEG) // 3 inputs but values_count == 2: must throw
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> input1_data{1, 4};
+  std::vector<float> input2_data{2, 5};
+  std::vector<float> input3_data{3, 6};
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2}, input1_data, memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({2}, input2_data, memory_manager.get());
+  Tensor input3_tensor = makeInputTensor<DataType::FLOAT32>({2}, input3_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  PackParams params{};
+  {
+    params.axis = 0;
+    params.values_count = 2;
+
+    Pack kernel({&input1_tensor, &input2_tensor, &input3_tensor}, &output_tensor, params);
+    EXPECT_ANY_THROW(kernel.configure());
+  }
+}
+
+TEST(Pack, InvalidInputAxis_NEG) // axis 2 with rank-1 inputs: configure() must throw
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> input1_data{1, 4};
+  std::vector<float> input2_data{2, 5};
+  std::vector<float> input3_data{3, 6};
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2}, input1_data, memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({2}, input2_data, memory_manager.get());
+  Tensor input3_tensor = makeInputTensor<DataType::FLOAT32>({2}, input3_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  PackParams params{};
+  {
+    params.axis = 2;
+    params.values_count = 3;
+
+    Pack kernel({&input1_tensor, &input2_tensor, &input3_tensor}, &output_tensor, params);
+    EXPECT_ANY_THROW(kernel.configure());
+  }
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Pad.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Pad.cpp
new file mode 100644
index 000000000..c07f6e310
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Pad.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Pad.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/pad.h>
+
+#include <limits>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Pad::Pad(const Tensor *input, const Tensor *paddings, Tensor *output)
+  : Kernel({input, paddings}, {output})
+{ // nothing else to set up: operands are handed to Kernel as {input, paddings} and {output}
+}
+
+void Pad::configure()
+{
+  // Validates operand types and the [num_dims, 2] paddings tensor, then sizes the output.
+  const Shape &input_shape = input()->shape();
+  const int num_dims = input_shape.num_dims();
+  if (num_dims > 4)
+    throw std::runtime_error("Unsupported number of dimensions.");
+
+  // Checked at run time (not via assert) so malformed operands are rejected in NDEBUG builds too.
+  LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
+  LUCI_INTERPRETER_CHECK(paddings()->element_type() == DataType::S32);
+  // Paddings shape should be [N, 2].
+  LUCI_INTERPRETER_CHECK(paddings()->shape().num_dims() == 2);
+  LUCI_INTERPRETER_CHECK(paddings()->shape().dim(0) == num_dims);
+  LUCI_INTERPRETER_CHECK(paddings()->shape().dim(1) == 2);
+
+  Shape output_shape(num_dims);
+  const auto *paddings_data = getTensorData<int32_t>(paddings());
+  for (int i = 0; i < num_dims; ++i)
+  {
+    const int32_t padding_before = paddings_data[i * 2];
+    const int32_t padding_after = paddings_data[i * 2 + 1];
+    LUCI_INTERPRETER_CHECK(padding_before >= 0 && padding_after >= 0);
+    output_shape.dim(i) = input_shape.dim(i) + padding_before + padding_after;
+  }
+  output()->resize(output_shape);
+}
+
+void Pad::execute() const
+{
+  // Builds tflite::PadParams from the paddings tensor and runs the reference Pad kernel.
+  const int rank = input()->shape().num_dims();
+  const auto *pads = getTensorData<int32_t>(paddings());
+  tflite::PadParams op_params{};
+  op_params.left_padding_count = rank;
+  op_params.right_padding_count = rank;
+  for (int d = 0; d < rank; ++d)
+  {
+    op_params.left_padding[d] = pads[2 * d];
+    op_params.right_padding[d] = pads[2 * d + 1];
+  }
+
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+    {
+      const float pad_value = 0.0f;
+      tflite::reference_ops::Pad(op_params, getTensorShape(input()),
+                                 getTensorData<float>(input()), &pad_value,
+                                 getTensorShape(output()), getTensorData<float>(output()));
+      break;
+    }
+    case DataType::U8:
+    {
+      // Quantized tensors are padded with the output zero point, which must fit the type.
+      assert(output()->zero_point() >= std::numeric_limits<uint8_t>::min());
+      assert(output()->zero_point() <= std::numeric_limits<uint8_t>::max());
+      const auto pad_value = static_cast<uint8_t>(output()->zero_point());
+      tflite::reference_ops::Pad(op_params, getTensorShape(input()),
+                                 getTensorData<uint8_t>(input()), &pad_value,
+                                 getTensorShape(output()), getTensorData<uint8_t>(output()));
+      break;
+    }
+    case DataType::S8:
+    {
+      assert(output()->zero_point() >= std::numeric_limits<int8_t>::min());
+      assert(output()->zero_point() <= std::numeric_limits<int8_t>::max());
+      const auto pad_value = static_cast<int8_t>(output()->zero_point());
+      tflite::reference_ops::Pad(op_params, getTensorShape(input()),
+                                 getTensorData<int8_t>(input()), &pad_value,
+                                 getTensorShape(output()), getTensorData<int8_t>(output()));
+      break;
+    }
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Pad.h b/compiler/luci-micro/luci-interpreter/src/kernels/Pad.h
new file mode 100644
index 000000000..e05b47f29
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Pad.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_PAD_H
+#define LUCI_INTERPRETER_KERNELS_PAD_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Pad : public Kernel // padding kernel: data input, paddings input, one output
+{
+public:
+  Pad(const Tensor *input, const Tensor *paddings, Tensor *output);
+  // Named accessors for entries of _inputs / _outputs.
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *paddings() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // Kernel overrides, defined out of line.
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_PAD_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Pad.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Pad.test.cpp
new file mode 100644
index 000000000..dd3ce947c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Pad.test.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Pad.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+float GetTolerance(float min, float max) { const float span = max - min; return span / 255.0; }
+
+TEST(Pad, Uint8) // U8 input; every padded cell is expected to dequantize to 0
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
+  std::vector<float> input_data{-0.8, 0.2, 0.9, 0.7, 0.1, -0.3};
+  std::vector<int32_t> paddings_data{0, 0, 0, 2, 1, 3, 0, 0};
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 3, 1}, quant_param.first, quant_param.second, input_data, memory_manager.get());
+  Tensor paddings_tensor =
+    makeInputTensor<DataType::S32>({4, 2}, paddings_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+  // Paddings are (before, after) per dimension: dim 1 gets (0, 2) and dim 2 gets (1, 3).
+  Pad kernel(&input_tensor, &paddings_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Two 7-wide data rows (one leading, three trailing pad cells) followed by two pad rows.
+  std::vector<float> ref_output_data{0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
+                                     0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0};
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, kQuantizedTolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 4, 7, 1}));
+}
+
+TEST(Pad, Int8) // S8 input; every padded cell is expected to dequantize to 0
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::pair<float, int32_t> quant_param = quantizationParams<int8_t>(-1.0f, 1.0f);
+  std::vector<float> input_data{-0.2, 0.4, 0.5, -0.7, -0.1, -0.9, 0.7, 0.1, 0.2};
+  std::vector<int32_t> paddings_data{0, 0, 1, 2, 2, 1, 0, 0};
+  Tensor input_tensor = makeInputTensor<DataType::S8>(
+    {1, 3, 3, 1}, quant_param.first, quant_param.second, input_data, memory_manager.get());
+  Tensor paddings_tensor =
+    makeInputTensor<DataType::S32>({4, 2}, paddings_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S8, quant_param.first, quant_param.second);
+  // Dim 1 is padded by (1, 2) and dim 2 by (2, 1): the 3x3 input sits inside a 6x6 output.
+  Pad kernel(&input_tensor, &paddings_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Row 0 and rows 4-5 are padding; rows 1-3 hold the input after two leading pad cells.
+  std::vector<float> ref_output_data{0, 0, 0,    0,    0,    0, 0, 0, -0.2, 0.4, 0.5, 0,
+                                     0, 0, -0.7, -0.1, -0.9, 0, 0, 0, 0.7,  0.1, 0.2, 0,
+                                     0, 0, 0,    0,    0,    0, 0, 0, 0,    0,   0,   0};
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, kQuantizedTolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 6, 6, 1}));
+}
+
+TEST(Pad, Float) // FLOAT32 input; padded cells are expected to be 0
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> input_data{1, 2, 3, 4, 5, 6};
+  std::vector<int32_t> paddings_data{1, 0, 0, 2, 0, 3, 0, 0};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 3, 1}, input_data, memory_manager.get());
+  Tensor paddings_tensor =
+    makeInputTensor<DataType::S32>({4, 2}, paddings_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  // Paddings: dim 0 (1, 0), dim 1 (0, 2), dim 2 (0, 3), so {1, 2, 3, 1} grows to {2, 4, 6, 1}.
+  Pad kernel(&input_tensor, &paddings_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // The first 24 values are the prepended all-zero slice; the input data starts at offset 24.
+  std::vector<float> ref_output_data{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                     0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 4, 5,
+                                     6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  std::initializer_list<int32_t> ref_output_shape{2, 4, 6, 1};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/PadV2.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/PadV2.cpp
new file mode 100644
index 000000000..197cdaa69
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/PadV2.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/PadV2.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/pad.h>
+
+#include <limits>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+PadV2::PadV2(const Tensor *input, const Tensor *paddings, const Tensor *constant_values,
+             Tensor *output)
+  : Kernel({input, paddings, constant_values}, {output}) // inputs first, then outputs
+{
+}
+
+void PadV2::configure()
+{
+  const Shape &input_shape = input()->shape();
+  const int num_dims = input_shape.num_dims();
+  // Validates the operands, then resizes the output to the input extent plus both paddings.
+  if (num_dims > 4)
+    throw std::runtime_error("Unsupported number of dimensions.");
+  // LUCI_INTERPRETER_CHECK rather than assert: these checks have to survive NDEBUG builds.
+  LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
+  LUCI_INTERPRETER_CHECK(paddings()->element_type() == DataType::S32);
+  LUCI_INTERPRETER_CHECK(constant_values()->element_type() == output()->element_type());
+  // Paddings shape should be [N, 2].
+  LUCI_INTERPRETER_CHECK(paddings()->shape().num_dims() == 2);
+  LUCI_INTERPRETER_CHECK(paddings()->shape().dim(0) == num_dims);
+  LUCI_INTERPRETER_CHECK(paddings()->shape().dim(1) == 2);
+  // Constant values elements number should be 1.
+  LUCI_INTERPRETER_CHECK(constant_values()->shape().num_elements() == 1);
+
+  Shape output_shape(num_dims);
+  const auto *paddings_data = getTensorData<int32_t>(paddings());
+  for (int i = 0; i < num_dims; ++i)
+  {
+    const int32_t padding_before = paddings_data[i * 2];
+    const int32_t padding_after = paddings_data[i * 2 + 1];
+    LUCI_INTERPRETER_CHECK(padding_before >= 0 && padding_after >= 0);
+    output_shape.dim(i) = input_shape.dim(i) + padding_before + padding_after;
+  }
+
+  output()->resize(output_shape);
+}
+
+void PadV2::execute() const
+{
+  // Pads with the single element of constant_values(); only FLOAT32 and U8 are dispatched.
+  const int rank = input()->shape().num_dims();
+  const auto *pad_pairs = getTensorData<int32_t>(paddings());
+
+  tflite::PadParams params{};
+  params.left_padding_count = rank;
+  params.right_padding_count = rank;
+  // paddings() is read as one (left, right) pair per dimension.
+  for (int dim = 0; dim < rank; ++dim)
+  {
+    params.left_padding[dim] = pad_pairs[2 * dim];
+    params.right_padding[dim] = pad_pairs[2 * dim + 1];
+  }
+
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+    {
+      const float fill = getTensorData<float>(constant_values())[0];
+      tflite::reference_ops::Pad(params, getTensorShape(input()), getTensorData<float>(input()),
+                                 &fill, getTensorShape(output()), getTensorData<float>(output()));
+      break;
+    }
+    case DataType::U8:
+    {
+      assert(output()->zero_point() >= std::numeric_limits<uint8_t>::min());
+      assert(output()->zero_point() <= std::numeric_limits<uint8_t>::max());
+      const uint8_t fill = getTensorData<uint8_t>(constant_values())[0];
+      tflite::reference_ops::Pad(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+                                 &fill, getTensorShape(output()),
+                                 getTensorData<uint8_t>(output()));
+      break;
+    }
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/PadV2.h b/compiler/luci-micro/luci-interpreter/src/kernels/PadV2.h
new file mode 100644
index 000000000..48a31f584
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/PadV2.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_PAD_V2_H
+#define LUCI_INTERPRETER_KERNELS_PAD_V2_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class PadV2 : public Kernel // padding kernel whose third input carries the pad value
+{
+public:
+  PadV2(const Tensor *input, const Tensor *paddings, const Tensor *constant_values, Tensor *output);
+  // Named accessors for entries of _inputs / _outputs.
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *paddings() const { return _inputs[1]; }
+  const Tensor *constant_values() const { return _inputs[2]; }
+  Tensor *output() const { return _outputs[0]; }
+  // Kernel overrides, defined out of line.
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_PAD_V2_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/PadV2.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/PadV2.test.cpp
new file mode 100644
index 000000000..41efaff06
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/PadV2.test.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/PadV2.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+float GetTolerance(float min, float max) { const float span = max - min; return span / 255.0; }
+
+TEST(PadV2, Uint8) // U8 input; padded cells are expected to dequantize to the constant 0.5
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
+  std::vector<float> input_data{-0.8, 0.2, 0.9, 0.7, 0.1, -0.3};
+  std::vector<int32_t> paddings_data{0, 0, 0, 2, 1, 3, 0, 0};
+  std::vector<float> constant_values_data{0.5};
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 3, 1}, quant_param.first, quant_param.second, input_data, memory_manager.get());
+  Tensor paddings_tensor =
+    makeInputTensor<DataType::S32>({4, 2}, paddings_data, memory_manager.get());
+  Tensor constant_values = makeInputTensor<DataType::U8>(
+    {1}, quant_param.first, quant_param.second, constant_values_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+  // Input, constant and output all use the same quantization parameters in this test.
+  PadV2 kernel(&input_tensor, &paddings_tensor, &constant_values, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Dim 1 is padded by (0, 2) and dim 2 by (1, 3); every added cell should read back as 0.5.
+  std::vector<float> ref_output_data = {
+    0.5, -0.8, 0.2, 0.9, 0.5, 0.5, 0.5, 0.5, 0.7, 0.1, -0.3, 0.5, 0.5, 0.5,  //
+    0.5, 0.5,  0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,  0.5, 0.5, 0.5}; //
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, kQuantizedTolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 4, 7, 1}));
+}
+
+TEST(PadV2, Float) // FLOAT32 input; padded cells are expected to hold the constant 7
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  std::vector<float> input_data{1, 2, 3, 4, 5, 6};
+  std::vector<int32_t> paddings_data{1, 0, 0, 2, 0, 3, 0, 0};
+  std::vector<float> constant_values_data{7};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 3, 1}, input_data, memory_manager.get());
+  Tensor paddings_tensor =
+    makeInputTensor<DataType::S32>({4, 2}, paddings_data, memory_manager.get());
+  Tensor constant_values =
+    makeInputTensor<DataType::FLOAT32>({1}, constant_values_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  // Paddings: dim 0 (1, 0), dim 1 (0, 2), dim 2 (0, 3), so {1, 2, 3, 1} grows to {2, 4, 6, 1}.
+  PadV2 kernel(&input_tensor, &paddings_tensor, &constant_values, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // The first 24 values are the prepended slice of 7s; the input data starts at offset 24.
+  std::vector<float> ref_output_data{7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+                                     7, 7, 7, 7, 7, 7, 7, 7, 1, 2, 3, 7, 7, 7, 4, 5,
+                                     6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
+  std::initializer_list<int32_t> ref_output_shape{2, 4, 6, 1};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Pow.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Pow.cpp
new file mode 100644
index 000000000..722c64024
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Pow.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Pow.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Pow::Pow(const Tensor *input1, const Tensor *input2, Tensor *output)
+  : Kernel({input1, input2}, {output}) // inputs first, then outputs
+{
+}
+
+void Pow::configure() // checks the element types, then sizes the output
+{
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type());
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type());
+  // All three types agree here; the output takes the broadcast of the two operand shapes.
+  output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+void Pow::execute() const
+{
+  // Dispatch on the first operand's element type; FLOAT32 and S32 are the only ones handled.
+  const auto type = input1()->element_type();
+  if (type == DataType::FLOAT32)
+    eval<float>();
+  else if (type == DataType::S32)
+    eval<int32_t>();
+  else
+  {
+    // Every other element type is rejected at run time.
+    throw std::runtime_error("Unsupported type.");
+  }
+}
+
+template <typename T> void Pow::eval() const
+{
+  // Runs the TFLite reference Pow; the broadcasting variant is used only when it is needed.
+  const auto lhs_shape = getTensorShape(input1());
+  const auto rhs_shape = getTensorShape(input2());
+  const auto out_shape = getTensorShape(output());
+  const T *lhs = getTensorData<T>(input1());
+  const T *rhs = getTensorData<T>(input2());
+  T *out = getTensorData<T>(output());
+  // params is only a required out-argument; just the returned broadcast flag is used.
+  tflite::ArithmeticParams params{};
+  if (tflite::reference_ops::ProcessBroadcastShapes(lhs_shape, rhs_shape, &params))
+  {
+    tflite::reference_ops::BroadcastPow4DSlow(lhs_shape, lhs, rhs_shape, rhs, out_shape, out);
+  }
+  else
+  {
+    tflite::reference_ops::Pow(lhs_shape, lhs, rhs_shape, rhs, out_shape, out);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Pow.h b/compiler/luci-micro/luci-interpreter/src/kernels/Pow.h
new file mode 100644
index 000000000..8ff865e40
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Pow.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_POW_H
+#define LUCI_INTERPRETER_KERNELS_POW_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Pow : public Kernel // binary kernel: two inputs, one output
+{
+public:
+  Pow(const Tensor *input1, const Tensor *input2, Tensor *output);
+  // Named accessors for entries of _inputs / _outputs.
+  const Tensor *input1() const { return _inputs[0]; }
+  const Tensor *input2() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // Kernel overrides, defined out of line.
+  void configure() override;
+  void execute() const override;
+
+private:
+  template <typename T> void eval() const; // typed implementation, T is the element type
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_POW_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Pow.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Pow.test.cpp
new file mode 100644
index 000000000..0e858115d
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Pow.test.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Pow.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class PowTest : public ::testing::Test // fixture that builds a TestMemoryManager in SetUp()
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Passed to makeInputTensor() and used to allocate the output tensor in the tests.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(PowTest, SimplePow) // float operands of identical shape
+{
+  std::initializer_list<int32_t> base_shape = {1, 1, 3, 2};
+
+  std::vector<float> input1_data{0.3f, 2.3f, 0.9f, 0.5f, 0.8f, 1.1f};
+  std::vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
+  std::vector<float> test_outputs{0.786f, 1.2838f, 1.043f, 0.7071f, 0.8f, 1.08956f};
+  // test_outputs[i] is input1_data[i] raised to the power input2_data[i].
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>(base_shape, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>(base_shape, input2_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  // The output is allocated only after configure() has run.
+  Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs, 0.0001f));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(base_shape));
+}
+
+TEST_F(PowTest, FloatBroadcastPow) // {1, 3} bases against {3, 1} exponents
+{
+  std::initializer_list<int32_t> input1_shape = {1, 3};
+  std::initializer_list<int32_t> input2_shape = {3, 1};
+
+  std::vector<float> input1_data{0.3f, 2.3f, 0.9f};
+  std::vector<float> input2_data{0.2f, 0.3f, 0.4f};
+  std::vector<float> test_outputs{0.786f,   1.18126f, 0.9791f, 0.6968f, 1.28386f,
+                                  0.96888f, 0.6178f,  1.3953f, 0.9587f};
+  // Nine expected values: group i of three uses exponent input2_data[i] over all three bases.
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>(input1_shape, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>(input2_shape, input2_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Only the values are compared; this test sets no expectation on the output shape.
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs, 0.0001f));
+}
+
+TEST_F(PowTest, IntPow) // S32 operands, exact integer comparison
+{
+  std::initializer_list<int32_t> base_shape = {1, 3};
+
+  std::vector<int32_t> input_data{2, 3, 4};
+  std::vector<int32_t> test_outputs{4, 27, 256};
+  // input_data feeds both operands, so each expected value is x raised to itself.
+  Tensor input1_tensor =
+    makeInputTensor<DataType::S32>(base_shape, input_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::S32>(base_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S32);
+
+  Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<int32_t>(output_tensor), ::testing::ElementsAreArray(test_outputs));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(base_shape));
+}
+
+TEST_F(PowTest, Input_Output_Type_NEG)
+{
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.0f}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.0f}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::BOOL);
+  // The BOOL output does not match the FLOAT32 inputs, so configure() has to throw.
+  Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(PowTest, Input_Type_Mismatch_NEG)
+{
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.0f}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {4}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  // FLOAT32 and S32 operands: configure() has to reject the mismatch.
+  Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(PowTest, Invalid_Input_Type_NEG)
+{
+  Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S64);
+  // All types are S64, so configure() is expected to pass; execute() has no S64 branch.
+  Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
+  kernel.configure();
+  EXPECT_ANY_THROW(kernel.execute());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Quantize.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Quantize.cpp
new file mode 100644
index 000000000..0c8544a65
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Quantize.cpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Quantize.h"
+#include "kernels/Utils.h"
+#include "PALQuantize.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+namespace
+{
+
+template <typename input_dtype> void call_requantize(const Tensor *input, Tensor *output)
+{
+  // Requantizes input into output: the input/output scale ratio becomes a multiplier + shift.
+  const double effective_output_scale = input->scale() / output->scale();
+  int32_t multiplier;
+  int shift;
+  quantizeMultiplier(effective_output_scale, &multiplier, &shift);
+
+  const auto size = tflite::MatchingFlatSize(getTensorShape(input), getTensorShape(output));
+  const auto input_data = getTensorData<input_dtype>(input);
+
+  // One call site for every output width; the pointer type selects the PAL instantiation.
+  const auto requantize_into = [&](auto *output_data) {
+    luci_interpreter_pal::Requantize(input_data, size, multiplier, shift, input->zero_point(),
+                                     output->zero_point(), output_data);
+  };
+
+  switch (output->element_type())
+  {
+    case loco::DataType::S8:
+      requantize_into(getTensorData<int8_t>(output));
+      break;
+    case loco::DataType::U8:
+      requantize_into(getTensorData<uint8_t>(output));
+      break;
+    case loco::DataType::S16:
+      requantize_into(getTensorData<int16_t>(output));
+      break;
+    default:
+      throw std::runtime_error("Unsupported quantized type, yet!");
+  }
+}
+
+} // namespace
+// Registers the single input and the single output with the Kernel base; no other setup.
+Quantize::Quantize(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {}
+
+void Quantize::configure()
+{
+  // Validates the input/output element-type pairing and gives the output the input's shape.
+  if (input()->element_type() == loco::DataType::S16)
+    LUCI_INTERPRETER_CHECK(input()->zero_point() == 0);
+  // FLOAT32 input is the quantize case; S16/S8/U8 input is the requantize case.
+  switch (input()->element_type())
+  {
+    case loco::DataType::FLOAT32: // NOTE(review): no zero_point check for S16 output - confirm
+    {
+      LUCI_INTERPRETER_CHECK(output()->element_type() == loco::DataType::U8 ||
+                             output()->element_type() == loco::DataType::S8 ||
+                             output()->element_type() == loco::DataType::S16);
+      break;
+    }
+    case loco::DataType::S16:
+    case loco::DataType::S8:
+    case loco::DataType::U8:
+    {
+      LUCI_INTERPRETER_CHECK(output()->element_type() == loco::DataType::S8 ||
+                             output()->element_type() == loco::DataType::U8 ||
+                             output()->element_type() == loco::DataType::S16);
+      if (output()->element_type() == loco::DataType::S16)
+      {
+        LUCI_INTERPRETER_CHECK(output()->zero_point() == 0);
+      }
+      break;
+    }
+    default:
+      throw std::runtime_error("Unsupported type");
+  }
+  // The output always takes the input's shape.
+  output()->resize(input()->shape());
+}
+
+void Quantize::execute() const
+{
+  // FLOAT32 input is quantized with the output's scale and zero point; S16/S8/U8 input is
+  // requantized through call_requantize<>(). Every other combination throws.
+  const auto input_type = input()->element_type();
+
+  if (input_type == loco::DataType::FLOAT32)
+  {
+    tflite::QuantizationParams op_params;
+    op_params.zero_point = output()->zero_point();
+    op_params.scale = output()->scale();
+    const auto input_data = getTensorData<float>(input());
+
+    // One call site for every output width; the pointer type selects the PAL instantiation.
+    const auto quantize_into = [&](auto *output_data) {
+      luci_interpreter_pal::Quantize(op_params, getTensorShape(input()), input_data,
+                                     getTensorShape(output()), output_data);
+    };
+
+    switch (output()->element_type())
+    {
+      case loco::DataType::S8:
+        quantize_into(getTensorData<int8_t>(output()));
+        break;
+      case loco::DataType::U8:
+        quantize_into(getTensorData<uint8_t>(output()));
+        break;
+      case loco::DataType::S16:
+        quantize_into(getTensorData<int16_t>(output()));
+        break;
+      default:
+        throw std::runtime_error("Unsupported type.");
+    }
+    return;
+  }
+
+  switch (input_type)
+  {
+    case loco::DataType::S16:
+    {
+      call_requantize<int16_t>(input(), output());
+      break;
+    }
+    case loco::DataType::S8:
+    {
+      call_requantize<int8_t>(input(), output());
+      break;
+    }
+    case loco::DataType::U8:
+    {
+      call_requantize<uint8_t>(input(), output());
+      break;
+    }
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Quantize.h b/compiler/luci-micro/luci-interpreter/src/kernels/Quantize.h
new file mode 100644
index 000000000..006c5366f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Quantize.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_QUANTIZE_H
+#define LUCI_INTERPRETER_KERNELS_QUANTIZE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Kernel declaration for Quantize. The constructor, configure() and execute()
+// are only declared here; the accessors return the first entries of the
+// tensor lists held by the Kernel base class.
+class Quantize : public Kernel
+{
+public:
+  Quantize(const Tensor *input, Tensor *output);
+
+  // Kernel interface overrides.
+  void configure() override;
+  void execute() const override;
+
+  // Tensor read by the kernel (first entry of _inputs).
+  const Tensor *input() const { return _inputs[0]; }
+  // Tensor written by the kernel (first entry of _outputs).
+  Tensor *output() const { return _outputs[0]; }
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_QUANTIZE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Quantize.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Quantize.test.cpp
new file mode 100644
index 000000000..22e67fe3f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Quantize.test.cpp
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Quantize.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Test fixture: installs a new TestMemoryManager in SetUp() for each test.
+class QuantizeTest : public ::testing::Test
+{
+protected:
+  void SetUp() override
+  {
+    // Replaces whatever manager the fixture held before.
+    _memory_manager = std::make_unique<TestMemoryManager>();
+  }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+// FLOAT32 input quantized to U8 with scale 0.5 and zero point 127.
+TEST_F(QuantizeTest, FloatUint8)
+{
+  std::vector<float> float_values{-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64};
+  std::vector<uint8_t> expected{0, 1, 2, 3, 4, 251, 252, 253, 254, 255};
+
+  Tensor src = makeInputTensor<DataType::FLOAT32>({2, 5}, float_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::U8, /*scale*/ 0.5, /*zero_point*/ 127);
+
+  Quantize quantize_kernel(&src, &dst);
+  quantize_kernel.configure();
+  // The output buffer is allocated between configure() and execute().
+  _memory_manager->allocate_memory(dst);
+  quantize_kernel.execute();
+
+  EXPECT_THAT(extractTensorData<uint8_t>(dst), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(dst), ::testing::ElementsAreArray({2, 5}));
+}
+
+// FLOAT32 input quantized to S8 with scale 0.5 and zero point -1.
+TEST_F(QuantizeTest, FloatInt8)
+{
+  std::vector<float> float_values{-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64};
+  std::vector<int8_t> expected{-128, -127, -126, -125, -124, 123, 124, 125, 126, 127};
+
+  Tensor src = makeInputTensor<DataType::FLOAT32>({2, 5}, float_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::S8, /*scale*/ 0.5, /*zero_point*/ -1);
+
+  Quantize quantize_kernel(&src, &dst);
+  quantize_kernel.configure();
+  // The output buffer is allocated between configure() and execute().
+  _memory_manager->allocate_memory(dst);
+  quantize_kernel.execute();
+
+  EXPECT_THAT(extractTensorData<int8_t>(dst), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(dst), ::testing::ElementsAreArray({2, 5}));
+}
+
+// FLOAT32 input quantized to S16 with scale 0.005 and zero point 0.
+TEST_F(QuantizeTest, FloatInt16)
+{
+  std::vector<float> float_values{-63.5, -63, -3, -2, -1, 1, 2, 3, 63.5, 64};
+  std::vector<int16_t> expected{-12700, -12600, -600, -400,  -200,
+                                200,    400,    600,  12700, 12800};
+
+  Tensor src = makeInputTensor<DataType::FLOAT32>({2, 5}, float_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::S16, /*scale*/ 0.005, /*zero_point*/ 0);
+
+  Quantize quantize_kernel(&src, &dst);
+  quantize_kernel.configure();
+  // The output buffer is allocated between configure() and execute().
+  _memory_manager->allocate_memory(dst);
+  quantize_kernel.execute();
+
+  EXPECT_THAT(extractTensorData<int16_t>(dst), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(dst), ::testing::ElementsAreArray({2, 5}));
+}
+
+// S16 input (scale 1.0, zero point 0) requantized to S16 with scale 0.5.
+TEST_F(QuantizeTest, Int16Int16)
+{
+  std::vector<float> real_values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  std::vector<int16_t> expected{2, 4, 6, 8, 10, 12, 14, 16, 18, 20};
+
+  Tensor src = makeInputTensor<DataType::S16>({1, 1, 2, 5}, /*scale*/ 1.0, /*zero_point*/ 0,
+                                              real_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::S16, /*scale*/ 0.5, /*zero_point*/ 0);
+
+  Quantize quantize_kernel(&src, &dst);
+  quantize_kernel.configure();
+  // The output buffer is allocated between configure() and execute().
+  _memory_manager->allocate_memory(dst);
+  quantize_kernel.execute();
+
+  EXPECT_THAT(extractTensorData<int16_t>(dst), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(dst), ::testing::ElementsAreArray({1, 1, 2, 5}));
+}
+
+// S8 input and S8 output sharing scale 0.5 and zero point -1.
+TEST_F(QuantizeTest, Int8Int8)
+{
+  std::vector<float> real_values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  std::vector<int8_t> expected{1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+
+  Tensor src = makeInputTensor<DataType::S8>({1, 1, 2, 5}, /*scale*/ 0.5, /*zero_point*/ -1,
+                                             real_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::S8, /*scale*/ 0.5, /*zero_point*/ -1);
+
+  Quantize quantize_kernel(&src, &dst);
+  quantize_kernel.configure();
+  // The output buffer is allocated between configure() and execute().
+  _memory_manager->allocate_memory(dst);
+  quantize_kernel.execute();
+
+  EXPECT_THAT(extractTensorData<int8_t>(dst), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(dst), ::testing::ElementsAreArray({1, 1, 2, 5}));
+}
+
+// U8 input and U8 output sharing scale 0.5 and zero point 127.
+TEST_F(QuantizeTest, Uint8Uint8)
+{
+  std::vector<float> real_values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  std::vector<uint8_t> expected{129, 131, 133, 135, 137, 139, 141, 143, 145, 147};
+
+  Tensor src = makeInputTensor<DataType::U8>({1, 1, 2, 5}, /*scale*/ 0.5, /*zero_point*/ 127,
+                                             real_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::U8, /*scale*/ 0.5, /*zero_point*/ 127);
+
+  Quantize quantize_kernel(&src, &dst);
+  quantize_kernel.configure();
+  // The output buffer is allocated between configure() and execute().
+  _memory_manager->allocate_memory(dst);
+  quantize_kernel.execute();
+
+  EXPECT_THAT(extractTensorData<uint8_t>(dst), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(dst), ::testing::ElementsAreArray({1, 1, 2, 5}));
+}
+
+// S16 input (scale 1.0, zero point 0) requantized to S8 (scale 0.5, zero point -1).
+TEST_F(QuantizeTest, Int16Int8)
+{
+  std::vector<float> real_values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  std::vector<int8_t> expected{1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+
+  Tensor src = makeInputTensor<DataType::S16>({1, 1, 2, 5}, /*scale*/ 1.0, /*zero_point*/ 0,
+                                              real_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::S8, /*scale*/ 0.5, /*zero_point*/ -1);
+
+  Quantize quantize_kernel(&src, &dst);
+  quantize_kernel.configure();
+  // The output buffer is allocated between configure() and execute().
+  _memory_manager->allocate_memory(dst);
+  quantize_kernel.execute();
+
+  EXPECT_THAT(extractTensorData<int8_t>(dst), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(dst), ::testing::ElementsAreArray({1, 1, 2, 5}));
+}
+
+// Negative case: an S32 input paired with an S8 output must make configure() throw.
+TEST_F(QuantizeTest, InvalidInputType_NEG)
+{
+  std::vector<float> real_values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+
+  Tensor src =
+    makeInputTensor<DataType::S32>({1, 1, 2, 5}, 0.5, 0, real_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::S8, /*scale*/ 0.5, /*zero_point*/ -1);
+
+  Quantize quantize_kernel(&src, &dst);
+  EXPECT_ANY_THROW(quantize_kernel.configure());
+}
+
+// Negative case: FLOAT32 input with a FLOAT32 output must make configure() throw.
+TEST_F(QuantizeTest, InvalidOutputTypeForFloatInput_NEG)
+{
+  std::vector<float> float_values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+
+  Tensor src =
+    makeInputTensor<DataType::FLOAT32>({1, 1, 2, 5}, float_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::FLOAT32);
+
+  Quantize quantize_kernel(&src, &dst);
+  EXPECT_ANY_THROW(quantize_kernel.configure());
+}
+
+// Negative case: S16 input with a FLOAT32 output must make configure() throw.
+TEST_F(QuantizeTest, InvalidOutputTypeForInt16Input_NEG)
+{
+  std::vector<float> real_values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+
+  Tensor src =
+    makeInputTensor<DataType::S16>({1, 1, 2, 5}, 0.5, 0, real_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::FLOAT32);
+
+  Quantize quantize_kernel(&src, &dst);
+  EXPECT_ANY_THROW(quantize_kernel.configure());
+}
+
+// Negative case: S8 input with a FLOAT32 output must make configure() throw.
+TEST_F(QuantizeTest, InvalidOutputTypeForInt8Input_NEG)
+{
+  std::vector<float> real_values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+
+  Tensor src =
+    makeInputTensor<DataType::S8>({1, 1, 2, 5}, 0.5, 0, real_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::FLOAT32);
+
+  Quantize quantize_kernel(&src, &dst);
+  EXPECT_ANY_THROW(quantize_kernel.configure());
+}
+
+// Negative case: U8 input with an S32 output must make configure() throw.
+TEST_F(QuantizeTest, InvalidOutputTypeForUint8Input_NEG)
+{
+  std::vector<float> real_values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+
+  Tensor src =
+    makeInputTensor<DataType::U8>({1, 1, 2, 5}, 0.5, 0, real_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::S32);
+
+  Quantize quantize_kernel(&src, &dst);
+  EXPECT_ANY_THROW(quantize_kernel.configure());
+}
+
+// Negative case: an S16 input whose zero point is -1 must make configure() throw.
+TEST_F(QuantizeTest, InvalidInputZeroPoint_NEG)
+{
+  std::vector<float> real_values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+
+  Tensor src =
+    makeInputTensor<DataType::S16>({1, 1, 2, 5}, 0.5, -1, real_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::S16, 0.5, 0);
+
+  Quantize quantize_kernel(&src, &dst);
+  EXPECT_ANY_THROW(quantize_kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Relu.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Relu.cpp
new file mode 100644
index 000000000..747ec6cc8
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Relu.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Relu.h"
+#include "kernels/Utils.h"
+
+#include "PALRelu.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+// Forwards the single input and single output tensor to the Kernel base class.
+Relu::Relu(const Tensor *input, Tensor *output)
+  : Kernel({input}, {output})
+{
+}
+
+// Validates the tensor types, prepares the requantization multiplier for the
+// quantized types and gives the output the same shape as the input.
+void Relu::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+
+  const bool is_s16 = (input()->element_type() == DataType::S16);
+  const bool is_u8 = (input()->element_type() == DataType::U8);
+
+  // S16 tensors are only accepted with a zero point of 0 on both sides.
+  if (is_s16)
+  {
+    LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && output()->zero_point() == 0);
+  }
+
+  // The input/output scale ratio is stored as a fixed-point multiplier and shift.
+  if (is_u8 || is_s16)
+  {
+    const double scale_ratio = input()->scale() / output()->scale();
+    quantizeMultiplier(scale_ratio, &_output_multiplier, &_output_shift);
+  }
+
+  output()->resize(input()->shape());
+}
+
+// Dispatches to the evaluation routine matching the input element type and
+// throws std::runtime_error for every other type.
+void Relu::execute() const
+{
+  const auto type = input()->element_type();
+
+  if (type == DataType::FLOAT32)
+  {
+    evalFloat();
+  }
+  else if (type == DataType::U8)
+  {
+    evalQuantized();
+  }
+  else if (type == DataType::S16)
+  {
+    evalQuantizedS16();
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+}
+
+// Float path: hands the input and output shapes and buffers to the PAL Relu routine.
+void Relu::evalFloat() const
+{
+  luci_interpreter_pal::Relu(getTensorShape(input()), getTensorData<float>(input()),
+                             getTensorShape(output()), getTensorData<float>(output()));
+}
+
+// U8 path: fills tflite::ReluParams and calls the PAL ReluX routine. The
+// activation range is [max(uint8 min, output zero point), uint8 max].
+void Relu::evalQuantized() const
+{
+  constexpr int32_t u8_lowest = static_cast<int32_t>(std::numeric_limits<uint8_t>::min());
+  constexpr int32_t u8_highest = static_cast<int32_t>(std::numeric_limits<uint8_t>::max());
+
+  tflite::ReluParams relu_params;
+  relu_params.input_offset = input()->zero_point();
+  relu_params.output_offset = output()->zero_point();
+  relu_params.output_multiplier = _output_multiplier;
+  relu_params.output_shift = _output_shift;
+
+  // The lower bound is clamped from below by the output zero point.
+  relu_params.quantized_activation_min = std::max(u8_lowest, relu_params.output_offset);
+  relu_params.quantized_activation_max = u8_highest;
+
+  luci_interpreter_pal::ReluX(relu_params, getTensorShape(input()),
+                              getTensorData<uint8_t>(input()), getTensorShape(output()),
+                              getTensorData<uint8_t>(output()));
+}
+
+// S16 path: rescales every element with the stored multiplier/shift and
+// clamps the result to [0, int16 max] before storing it.
+void Relu::evalQuantizedS16() const
+{
+  constexpr int32_t lower_bound = 0;
+  constexpr int32_t upper_bound = std::numeric_limits<int16_t>::max();
+
+  const int16_t *src = getTensorData<int16_t>(input());
+  int16_t *dst = getTensorData<int16_t>(output());
+  const int32_t element_count = input()->shape().num_elements();
+
+  for (int32_t idx = 0; idx < element_count; ++idx)
+  {
+    // Widen to int32_t before the fixed-point multiplication.
+    const int32_t widened = src[idx];
+    const int32_t rescaled =
+      tflite::MultiplyByQuantizedMultiplier(widened, _output_multiplier, _output_shift);
+    const int32_t clamped = std::min(std::max(rescaled, lower_bound), upper_bound);
+    dst[idx] = static_cast<int16_t>(clamped);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Relu.h b/compiler/luci-micro/luci-interpreter/src/kernels/Relu.h
new file mode 100644
index 000000000..b813f0cdf
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Relu.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_RELU_H
+#define LUCI_INTERPRETER_KERNELS_RELU_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Kernel declaration for Relu: one input tensor, one output tensor, and one
+// private evaluation helper per supported code path.
+class Relu : public Kernel
+{
+public:
+  Relu(const Tensor *input, Tensor *output);
+
+  // Kernel interface overrides.
+  void configure() override;
+  void execute() const override;
+
+  // Tensor read by the kernel (first entry of _inputs).
+  const Tensor *input() const { return _inputs[0]; }
+  // Tensor written by the kernel (first entry of _outputs).
+  Tensor *output() const { return _outputs[0]; }
+
+private:
+  // Evaluation helpers.
+  void evalFloat() const;
+  void evalQuantized() const;
+  void evalQuantizedS16() const;
+
+  // Fixed-point multiplier and shift; both start at 0.
+  int32_t _output_multiplier{0};
+  int32_t _output_shift{0};
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_RELU_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Relu.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Relu.test.cpp
new file mode 100644
index 000000000..bd32e3cc9
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Relu.test.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Relu.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Test fixture: installs a new TestMemoryManager in SetUp() for each test.
+class ReluTest : public ::testing::Test
+{
+protected:
+  void SetUp() override
+  {
+    // Replaces whatever manager the fixture held before.
+    _memory_manager = std::make_unique<TestMemoryManager>();
+  }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+// Float path: negative entries become 0, everything else passes through.
+TEST_F(ReluTest, FloatSimple)
+{
+  std::vector<float> float_values{
+    0.0f, 1.0f,  3.0f,  // Row 1
+    1.0f, -1.0f, -2.0f, // Row 2
+  };
+
+  std::vector<float> expected{
+    0.0f, 1.0f, 3.0f, // Row 1
+    1.0f, 0.0f, 0.0f, // Row 2
+  };
+
+  Tensor src = makeInputTensor<DataType::FLOAT32>({2, 3}, float_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::FLOAT32);
+
+  Relu relu_kernel(&src, &dst);
+  relu_kernel.configure();
+  // The output buffer is allocated between configure() and execute().
+  _memory_manager->allocate_memory(dst);
+  relu_kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(dst), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(dst), ::testing::ElementsAreArray({2, 3}));
+}
+
+// U8 path with identical quantization parameters on input and output.
+TEST_F(ReluTest, Uint8Quantized)
+{
+  std::vector<float> real_values{
+    0, -6, 2, 4, //
+    3, -2, 7, 1, //
+  };
+  // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+  const float range_min = (-128.0 / 128.0) * 8;
+  const float range_max = (127.0 / 128.0) * 8;
+
+  std::pair<float, int32_t> qparams = quantizationParams<uint8_t>(range_min, range_max);
+  const float scale = qparams.first;
+  const int32_t zero_point = qparams.second;
+
+  Tensor src = makeInputTensor<DataType::U8>({1, 2, 4, 1}, scale, zero_point, real_values,
+                                             _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::U8, scale, zero_point);
+
+  Relu relu_kernel(&src, &dst);
+  relu_kernel.configure();
+  // The output buffer is allocated between configure() and execute().
+  _memory_manager->allocate_memory(dst);
+  relu_kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(dst), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<uint8_t>(dst),
+              ::testing::ElementsAreArray({128, 128, 160, 192, 176, 128, 240, 144}));
+  EXPECT_THAT(dequantizeTensorData(dst), FloatArrayNear({0, 0, 2, 4, 3, 0, 7, 1}));
+}
+
+// U8 path where the output uses different quantization parameters than the input.
+TEST_F(ReluTest, Uint8Requantized)
+{
+  std::vector<float> real_values{
+    0, -6, 2, 4, //
+    3, -2, 7, 1, //
+  };
+
+  // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+  const float src_min = (-128.0 / 128.0) * 8;
+  const float src_max = (127.0 / 128.0) * 8;
+  const float dst_min = (0.0 / 256.0) * 8;
+  const float dst_max = (255.0 / 256.0) * 8;
+
+  std::pair<float, int32_t> src_qparams = quantizationParams<uint8_t>(src_min, src_max);
+  Tensor src = makeInputTensor<DataType::U8>({1, 2, 4, 1}, src_qparams.first,
+                                             src_qparams.second, real_values,
+                                             _memory_manager.get());
+
+  std::pair<float, int32_t> dst_qparams = quantizationParams<uint8_t>(dst_min, dst_max);
+  Tensor dst = makeOutputTensor(DataType::U8, dst_qparams.first, dst_qparams.second);
+
+  Relu relu_kernel(&src, &dst);
+  relu_kernel.configure();
+  // The output buffer is allocated between configure() and execute().
+  _memory_manager->allocate_memory(dst);
+  relu_kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(dst), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<uint8_t>(dst),
+              ::testing::ElementsAreArray({0, 0, 64, 128, 96, 0, 224, 32}));
+  EXPECT_THAT(dequantizeTensorData(dst), FloatArrayNear({0, 0, 2, 4, 3, 0, 7, 1}));
+}
+
+// S16 path: input scale 0.5, output scale 0.25, zero point 0 on both sides.
+TEST_F(ReluTest, SInt16)
+{
+  std::vector<float> real_values{
+    0, -6, 2, 4, //
+    3, -2, 7, 1, //
+  };
+  std::vector<float> expected{
+    0, 0, 2, 4, //
+    3, 0, 7, 1, //
+  };
+
+  Tensor src =
+    makeInputTensor<DataType::S16>({1, 2, 4, 1}, 0.5, 0, real_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::S16, 0.25, 0);
+
+  Relu relu_kernel(&src, &dst);
+  relu_kernel.configure();
+  // The output buffer is allocated between configure() and execute().
+  _memory_manager->allocate_memory(dst);
+  relu_kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(dst), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(dequantizeTensorData(dst), FloatArrayNear(expected));
+}
+
+// Negative case: FLOAT32 input with a U8 output must make configure() throw.
+TEST_F(ReluTest, Input_Output_Type_NEG)
+{
+  Tensor src = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::U8);
+
+  Relu relu_kernel(&src, &dst);
+  EXPECT_ANY_THROW(relu_kernel.configure());
+}
+
+// Negative case: S64 tensors get through configure(), but execute() must throw.
+TEST_F(ReluTest, Invalid_Input_Type_NEG)
+{
+  Tensor src = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::S64);
+
+  Relu relu_kernel(&src, &dst);
+  relu_kernel.configure();
+  _memory_manager->allocate_memory(dst);
+  EXPECT_ANY_THROW(relu_kernel.execute());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Relu6.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Relu6.cpp
new file mode 100644
index 000000000..07205ed3a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Relu6.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Relu6.h"
+#include "kernels/Utils.h"
+
+#include "PALRelu6.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+// Forwards the single input and single output tensor to the Kernel base class.
+Relu6::Relu6(const Tensor *input, Tensor *output)
+  : Kernel({input}, {output})
+{
+}
+
+// Checks that input and output share an element type, prepares the
+// requantization multiplier for U8 and copies the input shape to the output.
+void Relu6::configure()
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+
+  const bool is_u8 = (input()->element_type() == DataType::U8);
+  if (is_u8)
+  {
+    // The input/output scale ratio is stored as a fixed-point multiplier and shift.
+    const double scale_ratio = input()->scale() / output()->scale();
+    quantizeMultiplier(scale_ratio, &_output_multiplier, &_output_shift);
+  }
+
+  output()->resize(input()->shape());
+}
+
+// Runs the float or U8 evaluation routine depending on the input element
+// type; any other type results in std::runtime_error.
+void Relu6::execute() const
+{
+  const auto type = input()->element_type();
+
+  if (type == DataType::FLOAT32)
+  {
+    evalFloat();
+    return;
+  }
+  if (type == DataType::U8)
+  {
+    evalQuantized();
+    return;
+  }
+
+  throw std::runtime_error("Unsupported type.");
+}
+
+// Float path: hands the input and output shapes and buffers to the PAL Relu6 routine.
+void Relu6::evalFloat() const
+{
+  luci_interpreter_pal::Relu6(getTensorShape(input()), getTensorData<float>(input()),
+                              getTensorShape(output()), getTensorData<float>(output()));
+}
+
+// U8 path: fills tflite::ReluParams and calls the PAL ReluX routine. The
+// activation range is
+//   [max(uint8 min, output zero point),
+//    min(uint8 max, output zero point + round(6 / output scale))].
+void Relu6::evalQuantized() const
+{
+  tflite::ReluParams params;
+  params.input_offset = input()->zero_point();
+  params.output_offset = output()->zero_point();
+  params.output_multiplier = _output_multiplier;
+  params.output_shift = _output_shift;
+
+  // Lower bound: the output zero point, but never below the uint8 minimum.
+  params.quantized_activation_min =
+    std::max(static_cast<int32_t>(std::numeric_limits<uint8_t>::min()), params.output_offset);
+  // Upper bound: 6.0 expressed in output units and shifted by the zero point,
+  // but never above the uint8 maximum. The cast uses the standard int32_t, like
+  // every other cast in this file, instead of the non-standard `int32` alias.
+  params.quantized_activation_max =
+    std::min(static_cast<int32_t>(std::numeric_limits<uint8_t>::max()),
+             params.output_offset + static_cast<int32_t>(roundf(6.f / output()->scale())));
+
+  luci_interpreter_pal::ReluX(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+                              getTensorShape(output()), getTensorData<uint8_t>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Relu6.h b/compiler/luci-micro/luci-interpreter/src/kernels/Relu6.h
new file mode 100644
index 000000000..f5030b588
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Relu6.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_RELU6_H
+#define LUCI_INTERPRETER_KERNELS_RELU6_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Kernel declaration for Relu6: one input tensor, one output tensor, and
+// private float / quantized evaluation helpers.
+class Relu6 : public Kernel
+{
+public:
+  Relu6(const Tensor *input, Tensor *output);
+
+  // Kernel interface overrides.
+  void configure() override;
+  void execute() const override;
+
+  // Tensor read by the kernel (first entry of _inputs).
+  const Tensor *input() const { return _inputs[0]; }
+  // Tensor written by the kernel (first entry of _outputs).
+  Tensor *output() const { return _outputs[0]; }
+
+private:
+  // Evaluation helpers.
+  void evalFloat() const;
+  void evalQuantized() const;
+
+  // Fixed-point multiplier and shift; both start at 0.
+  int32_t _output_multiplier{0};
+  int32_t _output_shift{0};
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_RELU6_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Relu6.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Relu6.test.cpp
new file mode 100644
index 000000000..af7b3f3db
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Relu6.test.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Relu6.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Test fixture: installs a new TestMemoryManager in SetUp() for each test.
+class Relu6Test : public ::testing::Test
+{
+protected:
+  void SetUp() override
+  {
+    // Replaces whatever manager the fixture held before.
+    _memory_manager = std::make_unique<TestMemoryManager>();
+  }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+// Float path: negative entries become 0 and 7 is capped at 6.
+TEST_F(Relu6Test, FloatSimple)
+{
+  std::vector<float> float_values{
+    0.0f, 1.0f,  3.0f,  // Row 1
+    7.0f, -1.0f, -2.0f, // Row 2
+  };
+
+  std::vector<float> expected{
+    0.0f, 1.0f, 3.0f, // Row 1
+    6.0f, 0.0f, 0.0f, // Row 2
+  };
+
+  Tensor src = makeInputTensor<DataType::FLOAT32>({2, 3}, float_values, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::FLOAT32);
+
+  Relu6 relu6_kernel(&src, &dst);
+  relu6_kernel.configure();
+  // The output buffer is allocated between configure() and execute().
+  _memory_manager->allocate_memory(dst);
+  relu6_kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(dst), ::testing::ElementsAreArray(expected));
+  EXPECT_THAT(extractTensorShape(dst), ::testing::ElementsAreArray({2, 3}));
+}
+
+// U8 path with identical quantization parameters on input and output.
+TEST_F(Relu6Test, Uint8Quantized)
+{
+  // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+  const float range_min = (-128.0 / 128.0) * 10;
+  const float range_max = (127.0 / 128.0) * 10;
+  // Allowed absolute error for the dequantized comparison below.
+  const float max_abs_error = (range_max - range_min) / 255.0;
+
+  std::vector<float> real_values{
+    0,  -6, 2, 8, //
+    -2, 3,  7, 1, //
+  };
+
+  std::pair<float, int32_t> qparams = quantizationParams<uint8_t>(range_min, range_max);
+  const float scale = qparams.first;
+  const int32_t zero_point = qparams.second;
+
+  Tensor src = makeInputTensor<DataType::U8>({1, 2, 4, 1}, scale, zero_point, real_values,
+                                             _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::U8, scale, zero_point);
+
+  Relu6 relu6_kernel(&src, &dst);
+  relu6_kernel.configure();
+  // The output buffer is allocated between configure() and execute().
+  _memory_manager->allocate_memory(dst);
+  relu6_kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(dst), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<uint8_t>(dst),
+              ::testing::ElementsAreArray({128, 128, 154, 205, 128, 166, 205, 141}));
+  EXPECT_THAT(dequantizeTensorData(dst),
+              FloatArrayNear({0, 0, 2, 6, 0, 3, 6, 1}, max_abs_error));
+}
+
+// U8 path where the output uses different quantization parameters than the input.
+TEST_F(Relu6Test, Uint8Requantized)
+{
+  // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
+  const float src_min = (-128.0 / 128.0) * 10;
+  const float src_max = (127.0 / 128.0) * 10;
+  const float dst_min = (0.0 / 256.0) * 0;
+  const float dst_max = (255.0 / 256.0) * 6;
+  // Allowed absolute error for the dequantized comparison, derived from the input range.
+  const float max_abs_error = (src_max - src_min) / 255.0;
+
+  std::vector<float> real_values{
+    0,  -6, 2, 8, //
+    -2, 3,  7, 1, //
+  };
+
+  std::pair<float, int32_t> src_qparams = quantizationParams<uint8_t>(src_min, src_max);
+  Tensor src = makeInputTensor<DataType::U8>({1, 2, 4, 1}, src_qparams.first,
+                                             src_qparams.second, real_values,
+                                             _memory_manager.get());
+
+  std::pair<float, int32_t> dst_qparams = quantizationParams<uint8_t>(dst_min, dst_max);
+  Tensor dst = makeOutputTensor(DataType::U8, dst_qparams.first, dst_qparams.second);
+
+  Relu6 relu6_kernel(&src, &dst);
+  relu6_kernel.configure();
+  // The output buffer is allocated between configure() and execute().
+  _memory_manager->allocate_memory(dst);
+  relu6_kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(dst), ::testing::ElementsAreArray({1, 2, 4, 1}));
+  EXPECT_THAT(extractTensorData<uint8_t>(dst),
+              ::testing::ElementsAreArray({0, 0, 87, 255, 0, 127, 255, 43}));
+  EXPECT_THAT(dequantizeTensorData(dst),
+              FloatArrayNear({0, 0, 2, 6, 0, 3, 6, 1}, max_abs_error));
+}
+
+// Negative case: FLOAT32 input with a U8 output must make configure() throw.
+TEST_F(Relu6Test, Input_Output_Type_NEG)
+{
+  Tensor src = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::U8);
+
+  Relu6 relu6_kernel(&src, &dst);
+  EXPECT_ANY_THROW(relu6_kernel.configure());
+}
+
+// Negative case: S64 tensors get through configure(), but execute() must throw.
+TEST_F(Relu6Test, Invalid_Input_Type_NEG)
+{
+  Tensor src = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
+  Tensor dst = makeOutputTensor(DataType::S64);
+
+  Relu6 relu6_kernel(&src, &dst);
+  relu6_kernel.configure();
+  _memory_manager->allocate_memory(dst);
+  EXPECT_ANY_THROW(relu6_kernel.execute());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Reshape.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Reshape.cpp
new file mode 100644
index 000000000..61d3300b2
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Reshape.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Reshape.h"
+
+#include <cassert>
+#include <cstring>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+static Shape extractShapeFromTensor(const Tensor *tensor)
+{
+  // Each S32 element of the tensor becomes one dimension of the returned Shape.
+  assert(tensor->element_type() == DataType::S32);
+  const int32_t rank = tensor->shape().num_elements();
+  const int32_t *dims = tensor->data<int32_t>();
+  Shape result(rank);
+  for (int axis = 0; axis < rank; ++axis)
+    result.dim(axis) = dims[axis];
+  return result;
+}
+
+static void resolveUnknownDimension(const Shape &input_shape, Shape *output_shape)
+{
+  // At most one output dimension may be -1; it is replaced by the extent that makes the
+  // output element count equal to the input element count.
+  const int32_t input_count = input_shape.num_elements();
+  int32_t known_product = 1; // product of every explicitly given dimension
+  int wildcard_axis = -1;    // index of the -1 entry, or -1 while none has been seen
+  for (int axis = 0; axis < output_shape->num_dims(); ++axis)
+  {
+    const int32_t extent = output_shape->dim(axis);
+    if (extent != -1)
+    {
+      known_product *= extent;
+      continue;
+    }
+    assert(wildcard_axis == -1);
+    wildcard_axis = axis;
+  }
+  if (wildcard_axis != -1)
+  {
+    output_shape->dim(wildcard_axis) = input_count / known_product;
+    known_product *= output_shape->dim(wildcard_axis);
+  }
+  assert(known_product == input_count);
+}
+
+Reshape::Reshape(const Tensor *input, const Tensor *shape, Tensor *output)
+  : Kernel({input, shape}, {output}) // presumably stored in this order as _inputs / _outputs
+{
+}
+
+void Reshape::configure()
+{
+  Shape target_shape = extractShapeFromTensor(shape());     // may still contain one -1
+  resolveUnknownDimension(input()->shape(), &target_shape); // replaces the -1, if present
+  output()->resize(target_shape);
+}
+
+void Reshape::execute() const
+{
+  // The element order is unchanged, so the input bytes are copied to the output as-is.
+  const int32_t element_count = input()->shape().num_elements();
+  const size_t bytes_per_element = getDataTypeSize(input()->element_type());
+  const size_t byte_count = element_count * bytes_per_element;
+
+  std::memcpy(output()->data<void>(), input()->data<void>(), byte_count);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Reshape.h b/compiler/luci-micro/luci-interpreter/src/kernels/Reshape.h
new file mode 100644
index 000000000..99b947f77
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Reshape.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_RESHAPE_H
+#define LUCI_INTERPRETER_KERNELS_RESHAPE_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Reshape : public Kernel // copies `input` into `output` under the dims given by `shape`
+{
+public:
+  Reshape(const Tensor *input, const Tensor *shape, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; } // data to be reshaped
+  const Tensor *shape() const { return _inputs[1]; } // S32 tensor holding the target dims
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;     // resolves the output shape and resizes `output`
+  void execute() const override; // byte-wise copy of the input data into `output`
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_RESHAPE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Reshape.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Reshape.test.cpp
new file mode 100644
index 000000000..c2ff3ea1b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Reshape.test.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Reshape.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class ReshapeTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Passed to makeInputTensor() and used to allocate the output tensor in each test.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+// TODO Test types other than FLOAT32.
+
+TEST_F(ReshapeTest, Regular)
+{
+  // {1, 2, 2, 3} -> {3, 4}: the data must be unchanged and the output shape must be {3, 4}.
+  Shape input_shape{1, 2, 2, 3};
+  std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  Shape shape_shape{2};
+  std::vector<int32_t> shape_data{3, 4};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor shape_tensor =
+    makeInputTensor<DataType::S32>(shape_shape, shape_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Reshape kernel(&input_tensor, &shape_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3, 4}));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(input_data));
+}
+
+TEST_F(ReshapeTest, UnknownDimension)
+{
+  // 12 elements with requested dims {2, -1, 2}: the -1 must resolve to 3, giving {2, 3, 2}.
+  Shape input_shape{2, 1, 2, 3};
+  std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  Shape shape_shape{3};
+  std::vector<int32_t> shape_data{2, -1, 2};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor shape_tensor =
+    makeInputTensor<DataType::S32>(shape_shape, shape_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Reshape kernel(&input_tensor, &shape_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3, 2}));
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(input_data));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ResizeBilinear.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/ResizeBilinear.cpp
new file mode 100644
index 000000000..e2ddd6a7b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ResizeBilinear.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ResizeBilinear.h"
+
+#include "kernels/Utils.h"
+
+#include "PALResizeBilinear.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+ResizeBilinear::ResizeBilinear(const Tensor *input, const Tensor *size, Tensor *output,
+                               const ResizeBilinearParams &params) // options read via params()
+  : KernelWithParams<ResizeBilinearParams>({input, size}, {output}, params)
+{
+}
+
+void ResizeBilinear::configure()
+{
+  // Validate the operands, then build the output shape: dims 0 and 3 come from the input,
+  // dims 1 and 2 from the two values stored in the size tensor.
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() == 4);
+  LUCI_INTERPRETER_CHECK(size()->shape().num_dims() == 1);
+  LUCI_INTERPRETER_CHECK(size()->element_type() == DataType::S32);
+  if (params().half_pixel_centers && params().align_corners)
+    throw std::runtime_error("If half_pixel_centers is True, align_corners must be False.");
+  LUCI_INTERPRETER_CHECK(size()->shape().dim(0) == 2);
+
+  const Shape &in_shape = input()->shape();
+  const int32_t *new_size = getTensorData<int32_t>(size());
+  output()->resize(Shape{in_shape.dim(0), new_size[0], new_size[1], in_shape.dim(3)});
+}
+
+void ResizeBilinear::execute() const
+{
+  // Forward the two resize flags to the PAL kernel that matches the output element type.
+  tflite::ResizeBilinearParams pal_params{};
+  pal_params.align_corners = params().align_corners;
+  pal_params.half_pixel_centers = params().half_pixel_centers;
+  if (output()->element_type() == DataType::FLOAT32)
+  {
+    luci_interpreter_pal::ResizeBilinear(
+      pal_params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(size()),
+      getTensorData<int32_t>(size()), getTensorShape(output()), getTensorData<float>(output()));
+  }
+  else if (output()->element_type() == DataType::U8)
+  {
+    luci_interpreter_pal::ResizeBilinear(
+      pal_params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(size()),
+      getTensorData<int32_t>(size()), getTensorShape(output()), getTensorData<uint8_t>(output()));
+  }
+  else
+    throw std::runtime_error("Unsupported type.");
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ResizeBilinear.h b/compiler/luci-micro/luci-interpreter/src/kernels/ResizeBilinear.h
new file mode 100644
index 000000000..b7bdc2ab7
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ResizeBilinear.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_RESIZEBILINEAR_H
+#define LUCI_INTERPRETER_KERNELS_RESIZEBILINEAR_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class ResizeBilinear : public KernelWithParams<ResizeBilinearParams>
+{
+public:
+  ResizeBilinear(const Tensor *input, const Tensor *size, Tensor *output,
+                 const ResizeBilinearParams &params);
+  // Operand accessors; `size` is the second input, in the same order as the constructor.
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *size() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() validates the operands and resizes the output; execute() runs the resize.
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_RESIZEBILINEAR_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ResizeBilinear.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/ResizeBilinear.test.cpp
new file mode 100644
index 000000000..933a1128c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ResizeBilinear.test.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ResizeBilinear.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> size_shape,
+           std::initializer_list<int32_t> output_shape, std::initializer_list<float> input_data,
+           std::initializer_list<int32_t> size_data, std::initializer_list<float> output_data,
+           bool align_corners, bool half_pixel_centers)
+{
+  // Generic path: tensors are created as FLOAT32 and the result is read back as T.
+  ResizeBilinearParams kernel_params{};
+  kernel_params.align_corners = align_corners;
+  kernel_params.half_pixel_centers = half_pixel_centers;
+
+  std::unique_ptr<IMemoryManager> mem = std::make_unique<TestMemoryManager>();
+  Tensor input = makeInputTensor<DataType::FLOAT32>(input_shape, input_data, mem.get());
+  Tensor new_size = makeInputTensor<DataType::S32>(size_shape, size_data, mem.get());
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeBilinear kernel(&input, &new_size, &output, kernel_params);
+  kernel.configure();
+  mem->allocate_memory(output);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(extractTensorData<T>(output), FloatArrayNear(output_data));
+}
+
+template <>
+void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
+                    std::initializer_list<int32_t> size_shape,
+                    std::initializer_list<int32_t> output_shape,
+                    std::initializer_list<float> input_data,
+                    std::initializer_list<int32_t> size_data,
+                    std::initializer_list<float> output_data, bool align_corners,
+                    bool half_pixel_centers)
+{
+  // The TFLite examples use the raw uint8 values directly, which corresponds to a
+  // quantization scale of 1.0f and a zero point of 0.
+  ResizeBilinearParams kernel_params{};
+  kernel_params.align_corners = align_corners;
+  kernel_params.half_pixel_centers = half_pixel_centers;
+
+  std::unique_ptr<IMemoryManager> mem = std::make_unique<TestMemoryManager>();
+  Tensor input = makeInputTensor<DataType::U8>(input_shape, 1.0, 0, input_data, mem.get());
+  Tensor new_size = makeInputTensor<DataType::S32>(size_shape, size_data, mem.get());
+  Tensor output = makeOutputTensor(DataType::U8, 1.0, 0);
+
+  ResizeBilinear kernel(&input, &new_size, &output, kernel_params);
+  kernel.configure();
+  mem->allocate_memory(output);
+  kernel.execute();
+
+  // The second FloatArrayNear argument (the output scale, 1.0 here) is presumably the
+  // allowed absolute error on the dequantized values.
+  EXPECT_THAT(extractTensorShape(output), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(dequantizeTensorData(output),
+              FloatArrayNear(output_data, output.scale()));
+}
+
+template <typename T> class ResizeBilinearTest : public ::testing::Test
+{
+};
+// Typed tests in this suite are instantiated once for float and once for uint8_t.
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_SUITE(ResizeBilinearTest, DataTypes);
+
+TYPED_TEST(ResizeBilinearTest, SimpleTest)
+{
+  Check<TypeParam>({2, 2, 2, 1}, {2}, {2, 3, 3, 1}, // input_shape, size_shape, output_shape
+                   {
+                     3, 6,  //
+                     9, 12, //
+                     4, 10, //
+                     10, 16 //
+                   },
+                   {3, 3}, // size_data: becomes output dims 1 and 2
+                   {
+                     3, 5, 6,    //
+                     7, 9, 10,   //
+                     9, 11, 12,  //
+                     4, 8, 10,   //
+                     8, 12, 14,  //
+                     10, 14, 16, //
+                   },
+                   false, false); // align_corners, half_pixel_centers
+  SUCCEED();
+}
+
+TEST(ResizeBilinearTest, HalfPixelCenterFloatTest)
+{
+  Check<float>({2, 2, 2, 1}, {2}, {2, 3, 3, 1}, // input_shape, size_shape, output_shape
+               {
+                 1, 2, //
+                 3, 4, //
+                 1, 2, //
+                 3, 4  //
+               },
+               {3, 3}, // size_data: becomes output dims 1 and 2
+               {
+                 1, 1.5, 2, //
+                 2, 2.5, 3, //
+                 3, 3.5, 4, //
+                 1, 1.5, 2, //
+                 2, 2.5, 3, //
+                 3, 3.5, 4, //
+               },
+               false, true); // align_corners off, half_pixel_centers on
+  SUCCEED();
+}
+
+TEST(ResizeBilinearTest, HalfPixelCenterUint8Test)
+{
+  Check<uint8_t>({2, 2, 2, 1}, {2}, {2, 3, 3, 1}, // input_shape, size_shape, output_shape
+                 {
+                   3, 6,  //
+                   9, 12, //
+                   4, 10, //
+                   12, 16 //
+                 },
+                 {3, 3}, // size_data: becomes output dims 1 and 2
+                 {
+                   2, 4, 6,    //
+                   6, 7, 9,    //
+                   9, 10, 12,  //
+                   4, 7, 10,   //
+                   8, 10, 13,  //
+                   12, 14, 16, //
+                 },
+                 false, true); // align_corners off, half_pixel_centers on
+  SUCCEED();
+}
+
+TEST(ResizeBilinearTest, InputShapeInvalid_NEG)
+{
+  // The input has only three dimensions, so configure() is expected to throw.
+  std::unique_ptr<IMemoryManager> mem = std::make_unique<TestMemoryManager>();
+  Tensor rank3_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2},
+                                                           {
+                                                             3, 6,  //
+                                                             9, 12, //
+                                                             4, 10, //
+                                                             10, 16 //
+                                                           },
+                                                           mem.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3}, mem.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeBilinearParams no_flags{};
+  no_flags.align_corners = false;
+  no_flags.half_pixel_centers = false;
+
+  ResizeBilinear kernel(&rank3_tensor, &size_tensor, &output_tensor, no_flags);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(ResizeBilinearTest, SizeShapeInvalid_NEG)
+{
+  // The size tensor has shape {2, 1} instead of being 1-D, so configure() is expected to throw.
+  std::unique_ptr<IMemoryManager> mem = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1},
+                                                           {
+                                                             3, 6,  //
+                                                             9, 12, //
+                                                             4, 10, //
+                                                             10, 16 //
+                                                           },
+                                                           mem.get());
+  Tensor rank2_size = makeInputTensor<DataType::S32>({2, 1}, {3, 3}, mem.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeBilinearParams no_flags{};
+  no_flags.align_corners = false;
+  no_flags.half_pixel_centers = false;
+
+  ResizeBilinear kernel(&input_tensor, &rank2_size, &output_tensor, no_flags);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(ResizeBilinearTest, SizeDimInvalid_NEG)
+{
+  // The size tensor holds three values instead of two, so configure() is expected to throw.
+  std::unique_ptr<IMemoryManager> mem = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1},
+                                                           {
+                                                             3, 6,  //
+                                                             9, 12, //
+                                                             4, 10, //
+                                                             10, 16 //
+                                                           },
+                                                           mem.get());
+  Tensor three_values = makeInputTensor<DataType::S32>({3}, {3, 3, 1}, mem.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeBilinearParams no_flags{};
+  no_flags.align_corners = false;
+  no_flags.half_pixel_centers = false;
+
+  ResizeBilinear kernel(&input_tensor, &three_values, &output_tensor, no_flags);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(ResizeBilinearTest, InvalidParams_NEG)
+{
+  // align_corners and half_pixel_centers are both set, so configure() is expected to throw.
+  std::unique_ptr<IMemoryManager> mem = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1},
+                                                           {
+                                                             3, 6,  //
+                                                             9, 12, //
+                                                             4, 10, //
+                                                             10, 16 //
+                                                           },
+                                                           mem.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3}, mem.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeBilinearParams both_flags{};
+  both_flags.align_corners = true;
+  both_flags.half_pixel_centers = true;
+
+  ResizeBilinear kernel(&input_tensor, &size_tensor, &output_tensor, both_flags);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp
new file mode 100644
index 000000000..306cefbc2
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ResizeNearestNeighbor.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h>
+#include "PALResizeNearestNeighbor.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+ResizeNearestNeighbor::ResizeNearestNeighbor(const Tensor *input, const Tensor *size,
+                                             Tensor *output, // resized in configure()
+                                             const ResizeNearestNeighborParams &params)
+  : KernelWithParams<ResizeNearestNeighborParams>({input, size}, {output}, params)
+{
+}
+
+void ResizeNearestNeighbor::configure()
+{
+  // Validate the operands, then build the output shape: dims 0 and 3 come from the input,
+  // dims 1 and 2 from the two values stored in the size tensor.
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() == 4);
+  LUCI_INTERPRETER_CHECK(size()->shape().num_dims() == 1);
+  LUCI_INTERPRETER_CHECK(size()->element_type() == DataType::S32);
+  LUCI_INTERPRETER_CHECK(size()->shape().dim(0) == 2);
+
+  const Shape &in_shape = input()->shape();
+  const int32_t *new_size = getTensorData<int32_t>(size());
+  output()->resize(Shape{in_shape.dim(0), new_size[0], new_size[1], in_shape.dim(3)});
+}
+
+void ResizeNearestNeighbor::execute() const
+{
+  // Copy the resize flags into the TFLite parameter struct, then dispatch on the output type.
+  tflite::ResizeNearestNeighborParams tfl_params{};
+  tfl_params.align_corners = params().align_corners;
+  tfl_params.half_pixel_centers = params().half_pixel_centers;
+  if (output()->element_type() == DataType::FLOAT32) // NOTE(review): data passed as int32_t
+  {
+    tflite::reference_ops::ResizeNearestNeighbor(
+      tfl_params, getTensorShape(input()), getTensorData<int32_t>(input()), getTensorShape(size()),
+      getTensorData<int32_t>(size()), getTensorShape(output()), getTensorData<int32_t>(output()));
+  }
+  else if (output()->element_type() == DataType::U8)
+  {
+    luci_interpreter_pal::ResizeNearestNeighbor(
+      tfl_params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(size()),
+      getTensorData<int32_t>(size()), getTensorShape(output()), getTensorData<uint8_t>(output()));
+  }
+  else
+    throw std::runtime_error("Unsupported type.");
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ResizeNearestNeighbor.h b/compiler/luci-micro/luci-interpreter/src/kernels/ResizeNearestNeighbor.h
new file mode 100644
index 000000000..137d031cf
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ResizeNearestNeighbor.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_RESIZENEARESTNEIGHBOR_H
+#define LUCI_INTERPRETER_KERNELS_RESIZENEARESTNEIGHBOR_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class ResizeNearestNeighbor : public KernelWithParams<ResizeNearestNeighborParams>
+{
+public:
+  ResizeNearestNeighbor(const Tensor *input, const Tensor *size, Tensor *output,
+                        const ResizeNearestNeighborParams &params);
+  // Operand accessors; `size` is the second input, in the same order as the constructor.
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *size() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() validates the operands and resizes the output; execute() runs the resize.
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_RESIZENEARESTNEIGHBOR_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp
new file mode 100644
index 000000000..7ade02a6f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ResizeNearestNeighbor.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T> // generic path: tensors are always built as FLOAT32
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> size_shape,
+           std::initializer_list<int32_t> output_shape, std::initializer_list<float> input_data,
+           std::initializer_list<int32_t> size_data, std::initializer_list<float> output_data,
+           bool align_corners, bool half_pixel_centers)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeNearestNeighborParams params{};
+  params.align_corners = align_corners;
+  params.half_pixel_centers = half_pixel_centers;
+
+  ResizeNearestNeighbor kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor); // allocated only after configure() has run
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(extractTensorData<T>(output_tensor), FloatArrayNear(output_data));
+}
+
+template <> // uint8_t: quantize the inputs, compare dequantized outputs
+void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
+                    std::initializer_list<int32_t> size_shape,
+                    std::initializer_list<int32_t> output_shape,
+                    std::initializer_list<float> input_data,
+                    std::initializer_list<int32_t> size_data,
+                    std::initializer_list<float> output_data, bool align_corners,
+                    bool half_pixel_centers)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  // quant_param is (scale, zero_point) covering the data range plus 0; input and output share it.
+  std::pair<float, int32_t> quant_param =
+    quantizationParams<uint8_t>(std::min(input_data) < 0 ? std::min(input_data) : 0.f,
+                                std::max(input_data) > 0 ? std::max(input_data) : 0.f);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    input_shape, quant_param.first, quant_param.second, input_data, memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
+
+  ResizeNearestNeighborParams params{};
+  params.align_corners = align_corners;
+  params.half_pixel_centers = half_pixel_centers;
+
+  ResizeNearestNeighbor kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(output_data, output_tensor.scale()));
+}
+
+template <typename T> class ResizeNearestNeighborTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>; // element types the typed tests run with
+TYPED_TEST_SUITE(ResizeNearestNeighborTest, DataTypes);
+
+TYPED_TEST(ResizeNearestNeighborTest, SimpleTest) // resizes {2, 2, 2, 1} to {2, 3, 3, 1}
+{
+  Check<TypeParam>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
+                   {
+                     3, 6,  //
+                     9, 12, //
+                     4, 10, //
+                     10, 16 //
+                   },
+                   /*size_data=*/{3, 3},
+                   {
+                     3, 3, 6,    //
+                     3, 3, 6,    //
+                     9, 9, 12,   //
+                     4, 4, 10,   //
+                     4, 4, 10,   //
+                     10, 10, 16, //
+                   },
+                   /*align_corners=*/false, /*half_pixel_centers=*/false);
+}
+
+TYPED_TEST(ResizeNearestNeighborTest, AlignCenterTest) // same resize with align_corners set
+{
+  Check<TypeParam>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
+                   {
+                     3, 6,  //
+                     9, 12, //
+                     4, 10, //
+                     10, 16 //
+                   },
+                   /*size_data=*/{3, 3},
+                   {
+                     3, 6, 6,    //
+                     9, 12, 12,  //
+                     9, 12, 12,  //
+                     4, 10, 10,  //
+                     10, 16, 16, //
+                     10, 16, 16, //
+                   },
+                   /*align_corners=*/true, /*half_pixel_centers=*/false);
+}
+
+TYPED_TEST(ResizeNearestNeighborTest, HalfPixelCenterTest) // same resize, half_pixel_centers set
+{
+  Check<TypeParam>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
+                   {
+                     3, 6,  //
+                     9, 12, //
+                     4, 10, //
+                     10, 16 //
+                   },
+                   /*size_data=*/{3, 3},
+                   {
+                     3, 6, 6,    //
+                     9, 12, 12,  //
+                     9, 12, 12,  //
+                     4, 10, 10,  //
+                     10, 16, 16, //
+                     10, 16, 16, //
+                   },
+                   /*align_corners=*/false, /*half_pixel_centers=*/true);
+}
+
+TEST(ResizeNearestNeighborTest, InputShapeInvalid_NEG)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2},
+                                                           {
+                                                             3, 6,  //
+                                                             9, 12, //
+                                                             4, 10, //
+                                                             10, 16 //
+                                                           },
+                                                           memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3}, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeNearestNeighborParams params{};
+  params.align_corners = false;
+  params.half_pixel_centers = false;
+
+  ResizeNearestNeighbor kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure()); // the rank-3 input {2, 2, 2} must be rejected
+}
+
+TEST(ResizeNearestNeighborTest, SizeShapeInvalid_NEG)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1},
+                                                           {
+                                                             3, 6,  //
+                                                             9, 12, //
+                                                             4, 10, //
+                                                             10, 16 //
+                                                           },
+                                                           memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>({2, 1}, {3, 3}, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeNearestNeighborParams params{};
+  params.align_corners = false;
+  params.half_pixel_centers = false;
+
+  ResizeNearestNeighbor kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure()); // the rank-2 size tensor {2, 1} must be rejected
+}
+
+TEST(ResizeNearestNeighborTest, SizeDimInvalid_NEG)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1},
+                                                           {
+                                                             3, 6,  //
+                                                             9, 12, //
+                                                             4, 10, //
+                                                             10, 16 //
+                                                           },
+                                                           memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>({3}, {3, 3, 1}, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  ResizeNearestNeighborParams params{};
+  params.align_corners = false;
+  params.half_pixel_centers = false;
+
+  ResizeNearestNeighbor kernel(&input_tensor, &size_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure()); // a size tensor with 3 elements must be rejected
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ReverseV2.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/ReverseV2.cpp
new file mode 100644
index 000000000..1b6a5cc3b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ReverseV2.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ReverseV2.h"
+#include "kernels/Utils.h"
+#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+ReverseV2::ReverseV2(const Tensor *input, const Tensor *axes, Tensor *output)
+  : Kernel({input, axes}, {output}) // inputs: [input, axes]; outputs: [output]
+{
+}
+
+void ReverseV2::configure()
+{
+  // Shape/type preconditions are enforced at runtime; assert() vanishes under NDEBUG.
+  LUCI_INTERPRETER_CHECK(axes()->shape().num_dims() == 1);
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() >= axes()->shape().num_elements());
+  if (input()->element_type() != DataType::S32 && input()->element_type() != DataType::FLOAT32 &&
+      input()->element_type() != DataType::U8 && input()->element_type() != DataType::S16 &&
+      input()->element_type() != DataType::S64)
+  {
+    throw std::runtime_error("Unsupported input type.");
+  }
+  if (axes()->element_type() != DataType::S32)
+  {
+    throw std::runtime_error("Unsupported axes type.");
+  }
+  if (axes()->shape().num_elements() != 1) // element 0 is read right below
+  {
+    throw std::runtime_error("Current implementation supports exactly 1 axis.");
+  }
+  const int axis_value = getTensorData<int32_t>(axes())[0];
+  if (axis_value < 0 || axis_value >= input()->shape().num_dims())
+  {
+    throw std::runtime_error("Invalid axes value");
+  }
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  output()->resize(input()->shape());
+}
+
+void ReverseV2::execute() const
+{
+  const int axis = getTensorData<int32_t>(axes())[0]; // first element of the axes tensor
+  const DataType type = output()->element_type();
+  if (type != DataType::FLOAT32 && type != DataType::U8)
+    throw std::runtime_error("Unsupported output type");
+  if (type == DataType::FLOAT32)
+  {
+    tflite::reference_ops::Reverse<float>(axis, getTensorShape(input()),
+                                          getTensorData<float>(input()), getTensorShape(output()),
+                                          getTensorData<float>(output()));
+  }
+  else
+  {
+    tflite::reference_ops::Reverse<uint8_t>(
+      axis, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(output()),
+      getTensorData<uint8_t>(output()));
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ReverseV2.h b/compiler/luci-micro/luci-interpreter/src/kernels/ReverseV2.h
new file mode 100644
index 000000000..51211c703
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ReverseV2.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_REVERSE_H
+#define LUCI_INTERPRETER_KERNELS_REVERSE_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class ReverseV2 : public Kernel
+{
+public:
+  ReverseV2(const Tensor *input, const Tensor *axes, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *axes() const { return _inputs[1]; } // second kernel input: the axes tensor
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_REVERSE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/ReverseV2.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/ReverseV2.test.cpp
new file mode 100644
index 000000000..c0025faca
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/ReverseV2.test.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ReverseV2.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T> class ReverseV2Test : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>; // element types the typed test runs with
+TYPED_TEST_SUITE(ReverseV2Test, DataTypes);
+
+TYPED_TEST(ReverseV2Test, MultiDimensions)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  // Reverse a {4, 3, 2} tensor along axis 1; the element type comes from TypeParam.
+  std::vector<TypeParam> input_data{1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                                    13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
+  Shape input_shape{4, 3, 2};
+  std::vector<int32_t> axis_data{1};
+  Shape axis_shape{1};
+
+  std::vector<TypeParam> output_data{5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                                     17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20};
+  std::vector<int32_t> output_shape{4, 3, 2};
+
+  Tensor input_tensor =
+    makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data, memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>(axis_shape, axis_data, memory_manager.get());
+
+  Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
+
+  ReverseV2 kernel = ReverseV2(&input_tensor, &axis_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor); // allocated only after configure() has run
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
+              ::testing::ElementsAreArray(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Rsqrt.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Rsqrt.cpp
new file mode 100644
index 000000000..6dd92dc98
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Rsqrt.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Rsqrt.h"
+#include "kernels/Utils.h"
+
+#include <stdexcept>
+#include <cmath>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+// Forwards the single input and output tensor to the base Kernel constructor.
+Rsqrt::Rsqrt(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {}
+
+void Rsqrt::configure()
+{
+  // Element-wise op: the output must share the input's element type and takes its shape.
+  const bool types_match = input()->element_type() == output()->element_type();
+  if (!types_match)
+    throw std::runtime_error("Input/output tensor data type mismatch.");
+  output()->resize(input()->shape());
+}
+
+void Rsqrt::execute() const
+{
+  // FLOAT32 is the only element type with an implementation;
+  // anything else is reported to the caller as a runtime error.
+  const bool is_float = input()->element_type() == DataType::FLOAT32;
+  if (!is_float)
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+
+  evalFloat();
+}
+
+void Rsqrt::evalFloat() const
+{
+  // Writes 1 / sqrt(x) for every element of the flattened input.
+  const float *src = getTensorData<float>(input());
+  float *dst = getTensorData<float>(output());
+  const int count = getTensorShape(input()).FlatSize();
+  for (int idx = 0; idx < count; ++idx)
+  {
+    dst[idx] = 1.f / std::sqrt(src[idx]);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Rsqrt.h b/compiler/luci-micro/luci-interpreter/src/kernels/Rsqrt.h
new file mode 100644
index 000000000..adc5bcfa2
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Rsqrt.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_RSQRT_H
+#define LUCI_INTERPRETER_KERNELS_RSQRT_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Rsqrt : public Kernel
+{
+public:
+  Rsqrt(const Tensor *input, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const; // FLOAT32 implementation used by execute()
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_RSQRT_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Rsqrt.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Rsqrt.test.cpp
new file mode 100644
index 000000000..3c6494232
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Rsqrt.test.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Rsqrt.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<float> input_data, std::initializer_list<float> output_data)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  // Runs Rsqrt on a FLOAT32 tensor and compares the result's data and shape with expectations.
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Rsqrt kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor); // allocated only after configure() has run
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+TEST(RsqrtTest, SimpleRsqrt) // expected values are 1 / sqrt(input)
+{
+  Check(
+    /*input_shape=*/{1, 2, 4, 1}, /*output_shape=*/{1, 2, 4, 1},
+    /*input_data=*/
+    {
+      5, 4, 8, 2,     //
+      6, 7.5, 9, 0.3, //
+    },
+    /*output_data=*/
+    {
+      0.44721360, 0.5, 0.35355339, 0.70710678,       //
+      0.40824829, 0.36514837, 0.33333333, 1.8257419, //
+    });
+}
+
+TEST(RsqrtTest, Input_Output_Type_NEG)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S32);
+
+  Rsqrt kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure()); // FLOAT32 input with S32 output must be rejected
+}
+
+TEST(RsqrtTest, Invalid_Input_Type_NEG)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1}, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S64);
+
+  Rsqrt kernel(&input_tensor, &output_tensor);
+  kernel.configure(); // types match, so configure() is not expected to throw
+  memory_manager->allocate_memory(output_tensor);
+  EXPECT_ANY_THROW(kernel.execute()); // S64 has no implementation in execute()
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SVDF.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/SVDF.cpp
new file mode 100644
index 000000000..40d79aaa3
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SVDF.cpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/SVDF.h"
+#include "kernels/Utils.h"
+#include "PALSVDF.h"
+
+#include <tensorflow/lite/kernels/internal/quantization_util.h>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+namespace
+{
+TfLiteFusedActivation get_tflite_activation(Activation activation) // luci -> TfLite enum
+{
+  switch (activation)
+  {
+    case luci::FusedActFunc::RELU:
+      return kTfLiteActRelu;
+    case luci::FusedActFunc::RELU6:
+      return kTfLiteActRelu6;
+    case luci::FusedActFunc::RELU_N1_TO_1:
+      return kTfLiteActReluN1To1;
+    case luci::FusedActFunc::TANH:
+      return kTfLiteActTanh;
+    case luci::FusedActFunc::SIGN_BIT:
+      return kTfLiteActSignBit;
+    case luci::FusedActFunc::NONE:
+      return kTfLiteActNone;
+    default: // any value without a mapping above
+      throw std::runtime_error("Unsupported activation type");
+  }
+}
+} // namespace
+
+SVDF::SVDF(const Tensor *input, const Tensor *weight_feature, const Tensor *weight_time,
+           const Tensor *bias, const Tensor *input_activation_state, Tensor *output,
+           Tensor *scratchpad_activation_state, Tensor *scratchpad_1, Tensor *scratchpad_2,
+           Tensor *scratchpad_3, Tensor *scratchpad_4, Tensor *scratchpad_5, Tensor *scratchpad_6,
+           const SVDFParams &params)
+  : KernelWithParams<SVDFParams>({input, weight_feature, weight_time, bias, input_activation_state},
+                                 {output, scratchpad_activation_state, scratchpad_1, scratchpad_2,
+                                  scratchpad_3, scratchpad_4, scratchpad_5, scratchpad_6},
+                                 params)
+{
+  // Nothing else to do: tensors and params are handed to KernelWithParams in the order above.
+}
+
+void SVDF::configure()
+{
+  // Validates tensor types and shapes, then sizes the output and scratchpad tensors.
+  const Shape &input_shape = input()->shape();
+  const Shape &weight_features_shape = weight_feature()->shape();
+  const Shape &weight_time_shape = weight_time()->shape();
+
+  // Validate the input type and all ranks first: every dim() access below relies on them.
+  LUCI_INTERPRETER_CHECK(input()->element_type() == loco::DataType::FLOAT32 ||
+                         input()->element_type() == loco::DataType::S8);
+  LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 2);
+  LUCI_INTERPRETER_CHECK(weight_features_shape.num_dims() == 2);
+  LUCI_INTERPRETER_CHECK(weight_time_shape.num_dims() == 2);
+
+  // Validate inputs and output types
+  if (input()->element_type() == loco::DataType::S8)
+  {
+    LUCI_INTERPRETER_CHECK(weight_feature()->element_type() == loco::DataType::S8);
+    LUCI_INTERPRETER_CHECK(weight_time()->element_type() == loco::DataType::S16 ||
+                           weight_time()->element_type() == loco::DataType::S8);
+    if (bias())
+      LUCI_INTERPRETER_CHECK(bias()->element_type() == loco::DataType::S32);
+
+    LUCI_INTERPRETER_CHECK(input_activation_state()->element_type() == loco::DataType::S16 ||
+                           input_activation_state()->element_type() == loco::DataType::S8);
+    LUCI_INTERPRETER_CHECK(output()->element_type() == loco::DataType::S8);
+
+    // Note: now tflite support only ReLU activation for integer SVDF
+    LUCI_INTERPRETER_CHECK(params().activation == luci::FusedActFunc::RELU);
+  }
+  else if (weight_feature()->element_type() == loco::DataType::FLOAT32)
+  {
+    LUCI_INTERPRETER_CHECK(weight_time()->element_type() == loco::DataType::FLOAT32);
+    LUCI_INTERPRETER_CHECK(input_activation_state()->element_type() == loco::DataType::FLOAT32);
+    if (bias())
+      LUCI_INTERPRETER_CHECK(bias()->element_type() == loco::DataType::FLOAT32);
+    LUCI_INTERPRETER_CHECK(output()->element_type() == loco::DataType::FLOAT32);
+  }
+  else if ((weight_feature()->element_type() == loco::DataType::U8 ||
+            weight_feature()->element_type() == loco::DataType::S8) &&
+           input()->element_type() == loco::DataType::FLOAT32)
+  {
+    // TODO:: support hybrid SVDF op
+    throw std::runtime_error("Hybrid type is not currently supported");
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+
+  // Check all the parameters of tensor match within themselves and match the
+  // input configuration.
+  const int rank = params().svdf_rank;
+  const int batch_size = input_shape.dim(0);
+  const int num_filters = weight_features_shape.dim(0);
+  LUCI_INTERPRETER_CHECK(rank != 0);
+  LUCI_INTERPRETER_CHECK(num_filters % rank == 0);
+
+  const int num_units = num_filters / rank;
+  const int memory_size = weight_time_shape.dim(1);
+
+  // Validate Weight_Feature Input Tensor:
+  LUCI_INTERPRETER_CHECK(weight_features_shape.dim(1) == input_shape.dim(1));
+
+  // Validate Weight_Time Input Tensor:
+  LUCI_INTERPRETER_CHECK(weight_time_shape.dim(0) == num_filters);
+
+  // Validate Bias
+  if (bias())
+    LUCI_INTERPRETER_CHECK(bias()->shape().dim(0) == num_units);
+
+  // Validate Input Activation State
+  LUCI_INTERPRETER_CHECK(input_activation_state()->shape().num_dims() == 2);
+  LUCI_INTERPRETER_CHECK(input_activation_state()->shape().dim(0) == batch_size);
+  LUCI_INTERPRETER_CHECK(input_activation_state()->shape().dim(1) == memory_size * num_filters);
+
+  // Resize scratchpad_state to input_activation_state
+  auto scratchpad_activation_state = getOutputTensors()[1];
+  scratchpad_activation_state->resize({batch_size, memory_size * num_filters});
+
+  // Resize output tensor
+  output()->resize({batch_size, num_units});
+
+  luci_interpreter_pal::SetupScratchpadTensor(
+    input()->element_type(), weight_feature()->element_type(), getOutputTensors()[2],
+    getOutputTensors()[3], getOutputTensors()[4], getOutputTensors()[5], getOutputTensors()[6],
+    getOutputTensors()[7], input_shape, weight_time_shape, batch_size, num_filters, num_units);
+}
+
+void SVDF::execute() const
+{
+  switch (weight_feature()->element_type()) // dispatch on the weight type, not the input type
+  {
+    case loco::DataType::FLOAT32:
+      evalFloat();
+      break;
+    case loco::DataType::S8:
+    {
+      if (input()->element_type() == loco::DataType::S8) // fully integer kernel
+        evalInteger();
+      else
+        // TODO:: support hybrid SVDF op
+        throw std::runtime_error("Hybrid type is not currently supported");
+      break;
+    }
+    default:
+      throw std::runtime_error("Unsupported type");
+  }
+}
+
+void SVDF::evalInteger() const
+{
+  const auto effective_scale_1 = static_cast<double>(input()->scale() * weight_feature()->scale() /
+                                                     input_activation_state()->scale());
+  const auto effective_scale_2 = static_cast<double>(input_activation_state()->scale() *
+                                                     weight_time()->scale() / output()->scale());
+
+  int32_t effective_scale_1_a;
+  int effective_scale_1_b;
+  int32_t effective_scale_2_a;
+  int effective_scale_2_b;
+  // Convert each real-valued scale into an int32 multiplier (_a) and an exponent (_b).
+  tflite::QuantizeMultiplier(effective_scale_1, &effective_scale_1_a, &effective_scale_1_b);
+  tflite::QuantizeMultiplier(effective_scale_2, &effective_scale_2_a, &effective_scale_2_b);
+
+  TfLiteSVDFParams params_svdf{};
+  params_svdf.asymmetric_quantize_inputs = params().asymmetric_quantize_inputs;
+  params_svdf.rank = params().svdf_rank;
+  params_svdf.activation = get_tflite_activation(params().activation);
+  // NOTE(review): state and weight_time are read as int16_t, yet configure() also admits S8.
+  auto scratchpad_activation_state = getOutputTensors()[1];
+  // Note: it is expected that activation_state input variable tensor reset to zero,
+  // also expected that this variable tensor doesn't have buffer
+  auto scratchpad_data = getTensorData<int16_t>(scratchpad_activation_state);
+  std::fill_n(scratchpad_data, scratchpad_activation_state->shape().num_elements(), 0);
+
+  auto scratchpad = getOutputTensors()[2];
+  auto output_temp = getOutputTensors()[3];
+
+  int32_t input_zp = input()->zero_point();
+  int32_t output_zp = output()->zero_point();
+  luci_interpreter_pal::IntegerSVDF(
+    params_svdf, getTensorShape(input()), getTensorData<int8_t>(input()),
+    getTensorShape(weight_feature()), getTensorData<int8_t>(weight_feature()),
+    getTensorShape(weight_time()), getTensorData<int16_t>(weight_time()), getTensorShape(bias()),
+    getTensorData<int32_t>(bias()), scratchpad_data, getTensorShape(output()),
+    getTensorData<int8_t>(output()), getTensorData<int32_t>(scratchpad),
+    getTensorData<int32_t>(output_temp), effective_scale_1_a, effective_scale_1_b,
+    effective_scale_2_a, effective_scale_2_b, input_zp, output_zp);
+}
+
+void SVDF::evalFloat() const
+{
+  TfLiteSVDFParams params_svdf{}; // kernel params repackaged for the PAL call
+  params_svdf.asymmetric_quantize_inputs = params().asymmetric_quantize_inputs;
+  params_svdf.rank = params().svdf_rank;
+  params_svdf.activation = get_tflite_activation(params().activation);
+
+  auto scratchpad_activation_state = getOutputTensors()[1];
+  // Note: it is expected that activation_state input variable tensor reset to zero,
+  // also expected that this variable tensor doesn't have buffer
+  auto scratchpad_data = getTensorData<float>(scratchpad_activation_state);
+  std::fill_n(scratchpad_data, scratchpad_activation_state->shape().num_elements(), 0);
+
+  auto scratchpad_1 = getOutputTensors()[2]; // first of the scratchpads sized in configure()
+
+  luci_interpreter_pal::FloatSVDF(
+    params_svdf, getTensorShape(input()), getTensorData<float>(input()),
+    getTensorShape(weight_feature()), getTensorData<float>(weight_feature()),
+    getTensorShape(weight_time()), getTensorData<float>(weight_time()), getTensorShape(bias()),
+    getTensorData<float>(bias()), getTensorData<float>(scratchpad_1), scratchpad_data,
+    getTensorShape(output()), getTensorData<float>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SVDF.h b/compiler/luci-micro/luci-interpreter/src/kernels/SVDF.h
new file mode 100644
index 000000000..335a6cd8f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SVDF.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SVDF_H
+#define LUCI_INTERPRETER_KERNELS_SVDF_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class SVDF : public KernelWithParams<SVDFParams>
+{
+public:
+  SVDF(const Tensor *input, const Tensor *weight_feature, const Tensor *weight_time,
+       const Tensor *bias, const Tensor *input_activation_state, Tensor *output,
+       Tensor *scratchpad_activation_state, Tensor *scratchpad_1, Tensor *scratchpad_2,
+       Tensor *scratchpad_3, Tensor *scratchpad_4, Tensor *scratchpad_5, Tensor *scratchpad_6,
+       const SVDFParams &params);
+  // Input tensor accessors (indices presumably follow the constructor argument order).
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *weight_feature() const { return _inputs[1]; }
+  const Tensor *weight_time() const { return _inputs[2]; }
+  const Tensor *bias() const { return _inputs[3]; }
+  const Tensor *input_activation_state() const { return _inputs[4]; }
+  // Primary output; evalFloat() reaches the scratchpads through getOutputTensors().
+  Tensor *output() const { return _outputs[0]; }
+  // Kernel interface: configure() and execute() override the base-class hooks.
+  void configure() override;
+  void execute() const override;
+  // Per-type implementations; evalFloat() forwards to luci_interpreter_pal::FloatSVDF.
+private:
+  void evalFloat() const;
+  void evalInteger() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SVDF_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SVDF.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/SVDF.test.cpp
new file mode 100644
index 000000000..82bd9b009
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SVDF.test.cpp
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/SVDF.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class SVDFTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Allocator used by the tensors of a test; SetUp() creates a new one for every case.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(SVDFTest, FullIntegerTest)
+{
+  const int32_t batches = 2;
+  const int32_t input_size = 3;
+  const int32_t units = 4;
+  const int32_t memory_size = 10;
+  const int32_t rank = 1;
+  const int32_t num_filters = units * rank;
+  // Shapes of the kernel operands, derived from the sizes above.
+  Shape input_shape{batches, input_size};
+  Shape weight_feature_shape{num_filters, input_size};
+  Shape weight_time_shape{num_filters, memory_size};
+  Shape bias_shape{units};
+  Shape activation_state_shape{batches, memory_size * num_filters};
+  // Source data is given in float and handed to makeInputTensor with the parameters below.
+  std::vector<float> input_data{0.49837467, 0.19278903, 0.26584083,
+                                0.17660543, 0.52949083, -0.77931279};
+
+  std::vector<float> weight_feature_data{-0.31930989, -0.36118156, 0.0079667,   0.37613347,
+                                         0.22197971,  0.12416199,  0.27901134,  0.27557442,
+                                         0.3905206,   -0.36137494, -0.06634006, -0.10640851};
+
+  std::vector<float> weight_time_data{
+    -0.31930989, 0.37613347,  0.27901134,  -0.36137494, -0.36118156,
+    0.22197971,  0.27557442,  -0.06634006, 0.0079667,   0.12416199,
+
+    0.3905206,   -0.10640851, -0.0976817,  0.15294972,  0.39635518,
+    -0.02702999, 0.39296314,  0.15785322,  0.21931258,  0.31053296,
+
+    -0.36916667, 0.38031587,  -0.21580373, 0.27072677,  0.23622236,
+    0.34936687,  0.18174365,  0.35907319,  -0.17493086, 0.324846,
+
+    -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
+    -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657};
+
+  std::vector<float> bias_data{-0.0976817, 0.15294972, 0.39635518, -0.02702999};
+  // Quantization parameter pairs (presumably scale and zero point) for each tensor.
+  std::pair<float, int32_t> input_quant_param = quantizationParams<int8_t>(-1, 1);
+  std::pair<float, int32_t> weight_feature_quant_param = quantizationParams<int8_t>(-0.5, 0.5);
+  std::pair<float, int32_t> weight_time_quant_param = quantizationParams<int16_t>(-1, 1);
+  std::pair<float, int32_t> bias_quant_param = quantizationParams<int32_t>(-512, 512);
+  std::pair<float, int32_t> activation_state_quant_param = quantizationParams<int16_t>(-16, 16);
+
+  std::pair<float, int32_t> output_quant_param = quantizationParams<int8_t>(-0.5, 0.5);
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::S8>(input_shape, input_quant_param.first, input_quant_param.second,
+                                  input_data, _memory_manager.get());
+  Tensor weight_feature_tensor = makeInputTensor<DataType::S8>(
+    weight_feature_shape, weight_feature_quant_param.first, weight_feature_quant_param.second,
+    weight_feature_data, _memory_manager.get());
+  Tensor weight_time_tensor = makeInputTensor<DataType::S16>(
+    weight_time_shape, weight_time_quant_param.first, weight_time_quant_param.second,
+    weight_time_data, _memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S32>(
+    bias_shape, bias_quant_param.first, bias_quant_param.second, bias_data, _memory_manager.get());
+  Tensor activation_state_tensor = makeOutputTensor(
+    DataType::S16, activation_state_quant_param.first, activation_state_quant_param.second);
+  activation_state_tensor.resize(activation_state_shape);
+  Tensor output_tensor =
+    makeOutputTensor(DataType::S8, output_quant_param.first, output_quant_param.second);
+  // Scratch tensors start out empty (Shape({})) and are allocated only after configure().
+  Tensor scratchpad_activation_state(DataType::S16, Shape({}), {}, "");
+  Tensor scratchpad_1(DataType::S32, Shape({}), {}, "");
+  Tensor scratchpad_2(DataType::S32, Shape({}), {}, "");
+  Tensor scratchpad_3(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_4(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_5(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_6(DataType::FLOAT32, Shape({}), {}, "");
+
+  SVDFParams params{};
+  params.activation = Activation::RELU;
+  params.asymmetric_quantize_inputs = false;
+  params.svdf_rank = rank;
+
+  SVDF kernel(&input_tensor, &weight_feature_tensor, &weight_time_tensor, &bias_tensor,
+              &activation_state_tensor, &output_tensor, &scratchpad_activation_state, &scratchpad_1,
+              &scratchpad_2, &scratchpad_3, &scratchpad_4, &scratchpad_5, &scratchpad_6, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  _memory_manager->allocate_memory(scratchpad_activation_state);
+  _memory_manager->allocate_memory(scratchpad_1);
+  _memory_manager->allocate_memory(scratchpad_2);
+  _memory_manager->allocate_memory(scratchpad_3);
+  _memory_manager->allocate_memory(scratchpad_4);
+  _memory_manager->allocate_memory(scratchpad_5);
+  _memory_manager->allocate_memory(scratchpad_6);
+  kernel.execute();
+  // Expected int8 output values; the expected shape is {batches, units}.
+  std::vector<int8_t> ref_output_data{-9, 24, 31, 1, -10, 10, -3, 0};
+
+  std::vector<int32_t> ref_output_shape{batches, units};
+  EXPECT_THAT(extractTensorData<int8_t>(output_tensor), ref_output_data);
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(SVDFTest, FloatTest)
+{
+  const int32_t batches = 2;
+  const int32_t input_size = 3;
+  const int32_t units = 4;
+  const int32_t memory_size = 10;
+  const int32_t rank = 1;
+  const int32_t num_filters = units * rank;
+  // Shapes of the kernel operands, derived from the sizes above.
+  Shape input_shape{batches, input_size};
+  Shape weight_feature_shape{num_filters, input_size};
+  Shape weight_time_shape{num_filters, memory_size};
+  Shape activation_state_shape{batches, memory_size * num_filters};
+
+  std::vector<float> input_data{0.12609188, -0.46347019, -0.89598465,
+                                0.35867718, 0.36897406,  0.73463392};
+
+  std::vector<float> weight_feature_data{-0.31930989, -0.36118156, 0.0079667,   0.37613347,
+                                         0.22197971,  0.12416199,  0.27901134,  0.27557442,
+                                         0.3905206,   -0.36137494, -0.06634006, -0.10640851};
+
+  std::vector<float> weight_time_data{
+    -0.31930989, 0.37613347,  0.27901134,  -0.36137494, -0.36118156,
+    0.22197971,  0.27557442,  -0.06634006, 0.0079667,   0.12416199,
+
+    0.3905206,   -0.10640851, -0.0976817,  0.15294972,  0.39635518,
+    -0.02702999, 0.39296314,  0.15785322,  0.21931258,  0.31053296,
+
+    -0.36916667, 0.38031587,  -0.21580373, 0.27072677,  0.23622236,
+    0.34936687,  0.18174365,  0.35907319,  -0.17493086, 0.324846,
+
+    -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
+    -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657};
+  // No bias tensor is created: nullptr is passed for the bias input below.
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor weight_feature_tensor = makeInputTensor<DataType::FLOAT32>(
+    weight_feature_shape, weight_feature_data, _memory_manager.get());
+  Tensor weight_time_tensor =
+    makeInputTensor<DataType::FLOAT32>(weight_time_shape, weight_time_data, _memory_manager.get());
+  Tensor activation_state_tensor = makeOutputTensor(DataType::FLOAT32);
+  activation_state_tensor.resize(activation_state_shape);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  // Scratch tensors start out empty (Shape({})) and are allocated only after configure().
+  Tensor scratchpad_activation_state(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_1(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_2(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_3(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_4(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_5(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_6(DataType::FLOAT32, Shape({}), {}, "");
+
+  SVDFParams params{};
+  params.activation = Activation::NONE;
+  params.asymmetric_quantize_inputs = false;
+  params.svdf_rank = rank;
+
+  SVDF kernel(&input_tensor, &weight_feature_tensor, &weight_time_tensor, nullptr,
+              &activation_state_tensor, &output_tensor, &scratchpad_activation_state, &scratchpad_1,
+              &scratchpad_2, &scratchpad_3, &scratchpad_4, &scratchpad_5, &scratchpad_6, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  _memory_manager->allocate_memory(scratchpad_activation_state);
+  _memory_manager->allocate_memory(scratchpad_1);
+  _memory_manager->allocate_memory(scratchpad_2);
+  _memory_manager->allocate_memory(scratchpad_3);
+  _memory_manager->allocate_memory(scratchpad_4);
+  _memory_manager->allocate_memory(scratchpad_5);
+  _memory_manager->allocate_memory(scratchpad_6);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{0.014899,    -0.0517661, -0.143725, -0.00271883,
+                                     -0.03004015, 0.09565311, 0.1587342, 0.00784263};
+  // Shapes are integral, so the expected shape is int32_t as in FullIntegerTest.
+  std::vector<int32_t> ref_output_shape{batches, units};
+  const float tolerance = 1e-5;
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data, tolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(SVDFTest, Unsupported_Type_Configure_NEG)
+{
+  const int32_t batches = 2;
+  const int32_t input_size = 3;
+  const int32_t units = 4;
+  const int32_t memory_size = 10;
+  const int32_t rank = 1;
+  const int32_t num_filters = units * rank;
+
+  Shape input_shape{batches, input_size};
+  Shape weight_feature_shape{num_filters, input_size};
+  Shape weight_time_shape{num_filters, memory_size};
+  Shape activation_state_shape{batches, memory_size * num_filters};
+  // S32 input combined with float weights: the combination configure() is expected to reject.
+  std::vector<int32_t> input_data{0, 1, 3, 4, 4, -2};
+
+  std::vector<float> weight_feature_data{-0.31930989, -0.36118156, 0.0079667,   0.37613347,
+                                         0.22197971,  0.12416199,  0.27901134,  0.27557442,
+                                         0.3905206,   -0.36137494, -0.06634006, -0.10640851};
+
+  std::vector<float> weight_time_data{
+    -0.31930989, 0.37613347,  0.27901134,  -0.36137494, -0.36118156,
+    0.22197971,  0.27557442,  -0.06634006, 0.0079667,   0.12416199,
+
+    0.3905206,   -0.10640851, -0.0976817,  0.15294972,  0.39635518,
+    -0.02702999, 0.39296314,  0.15785322,  0.21931258,  0.31053296,
+
+    -0.36916667, 0.38031587,  -0.21580373, 0.27072677,  0.23622236,
+    0.34936687,  0.18174365,  0.35907319,  -0.17493086, 0.324846,
+
+    -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
+    -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657};
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::S32>(input_shape, input_data, _memory_manager.get());
+  Tensor weight_feature_tensor = makeInputTensor<DataType::FLOAT32>(
+    weight_feature_shape, weight_feature_data, _memory_manager.get());
+  Tensor weight_time_tensor =
+    makeInputTensor<DataType::FLOAT32>(weight_time_shape, weight_time_data, _memory_manager.get());
+  Tensor activation_state_tensor = makeOutputTensor(DataType::FLOAT32);
+  activation_state_tensor.resize(activation_state_shape);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Tensor scratchpad_activation_state(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_1(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_2(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_3(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_4(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_5(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_6(DataType::FLOAT32, Shape({}), {}, "");
+
+  SVDFParams params{};
+  params.activation = Activation::NONE;
+  params.asymmetric_quantize_inputs = false;
+  params.svdf_rank = rank;
+  // Only configure() is expected to throw; nothing is allocated or executed in this test.
+  SVDF kernel(&input_tensor, &weight_feature_tensor, &weight_time_tensor, nullptr,
+              &activation_state_tensor, &output_tensor, &scratchpad_activation_state, &scratchpad_1,
+              &scratchpad_2, &scratchpad_3, &scratchpad_4, &scratchpad_5, &scratchpad_6, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(SVDFTest, Invalid_Input_Shape_NEG)
+{
+  const int32_t batches = 2;
+  const int32_t right_input_size = 3;
+  const int32_t wrong_input_size = 4;
+  const int32_t units = 4;
+  const int32_t memory_size = 10;
+  const int32_t rank = 1;
+  const int32_t num_filters = units * rank;
+  // The input uses wrong_input_size while weight_feature keeps right_input_size.
+  Shape input_shape{batches, wrong_input_size};
+  Shape weight_feature_shape{num_filters, right_input_size};
+  Shape weight_time_shape{num_filters, memory_size};
+  Shape activation_state_shape{batches, memory_size * num_filters};
+
+  std::vector<float> input_data{0, 1, 3, 2, 4, 4, -2, 1};
+
+  std::vector<float> weight_feature_data{-0.31930989, -0.36118156, 0.0079667,   0.37613347,
+                                         0.22197971,  0.12416199,  0.27901134,  0.27557442,
+                                         0.3905206,   -0.36137494, -0.06634006, -0.10640851};
+
+  std::vector<float> weight_time_data{
+    -0.31930989, 0.37613347,  0.27901134,  -0.36137494, -0.36118156,
+    0.22197971,  0.27557442,  -0.06634006, 0.0079667,   0.12416199,
+
+    0.3905206,   -0.10640851, -0.0976817,  0.15294972,  0.39635518,
+    -0.02702999, 0.39296314,  0.15785322,  0.21931258,  0.31053296,
+
+    -0.36916667, 0.38031587,  -0.21580373, 0.27072677,  0.23622236,
+    0.34936687,  0.18174365,  0.35907319,  -0.17493086, 0.324846,
+
+    -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
+    -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657};
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor weight_feature_tensor = makeInputTensor<DataType::FLOAT32>(
+    weight_feature_shape, weight_feature_data, _memory_manager.get());
+  Tensor weight_time_tensor =
+    makeInputTensor<DataType::FLOAT32>(weight_time_shape, weight_time_data, _memory_manager.get());
+  Tensor activation_state_tensor = makeOutputTensor(DataType::FLOAT32);
+  activation_state_tensor.resize(activation_state_shape);
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Tensor scratchpad_activation_state(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_1(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_2(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_3(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_4(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_5(DataType::FLOAT32, Shape({}), {}, "");
+  Tensor scratchpad_6(DataType::FLOAT32, Shape({}), {}, "");
+
+  SVDFParams params{};
+  params.activation = Activation::NONE;
+  params.asymmetric_quantize_inputs = false;
+  params.svdf_rank = rank;
+  // Only configure() is expected to throw; nothing is allocated or executed in this test.
+  SVDF kernel(&input_tensor, &weight_feature_tensor, &weight_time_tensor, nullptr,
+              &activation_state_tensor, &output_tensor, &scratchpad_activation_state, &scratchpad_1,
+              &scratchpad_2, &scratchpad_3, &scratchpad_4, &scratchpad_5, &scratchpad_6, params);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Shape.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Shape.cpp
new file mode 100644
index 000000000..0429fe1e5
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Shape.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Shape.h"
+#include "kernels/Utils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+// Registers `input` as the only input tensor and `output` as the only output tensor.
+ShapeKernel::ShapeKernel(const Tensor *input, Tensor *output, const ShapeParams &params)
+  : KernelWithParams<ShapeParams>({input}, {output}, params)
+{
+}
+
+void ShapeKernel::configure()
+{
+  LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::S32 or
+                         output()->element_type() == DataType::S64);
+  // execute() dispatches on params().out_type, so it must match the tensor that is written.
+  LUCI_INTERPRETER_CHECK(output()->element_type() == params().out_type);
+  // The result is a rank-1 tensor holding one entry per input dimension.
+  Shape output_shape(1);
+  output_shape.dim(0) = input()->shape().num_dims();
+  output()->resize(output_shape);
+}
+
+void ShapeKernel::execute() const
+{
+  // Only 32- and 64-bit integer outputs are handled; dispatch on the requested type.
+  const auto out_type = params().out_type;
+  if (out_type != DataType::S32 and out_type != DataType::S64)
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+
+  if (out_type == DataType::S32)
+    evalInt<int32_t>();
+  else
+    evalInt<int64_t>();
+}
+
+template <typename T> void ShapeKernel::evalInt() const
+{
+  // Store every extent of the input shape, converted to T, in the output buffer.
+  const auto &shape = input()->shape();
+  const int rank = shape.num_dims();
+  T *out = getTensorData<T>(output());
+  for (int axis = 0; axis < rank; ++axis)
+  {
+    out[axis] = shape.dim(axis);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Shape.h b/compiler/luci-micro/luci-interpreter/src/kernels/Shape.h
new file mode 100644
index 000000000..cfaadec91
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Shape.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SHAPE_H
+#define LUCI_INTERPRETER_KERNELS_SHAPE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class ShapeKernel : public KernelWithParams<ShapeParams>
+{
+public:
+  ShapeKernel(const Tensor *input, Tensor *output, const ShapeParams &params);
+  // The single input (only its shape is read) and the tensor receiving the dimensions.
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() validates the output type and sizes it; execute() writes the dimensions.
+  void configure() override;
+  void execute() const override;
+
+private:
+  template <typename T> void evalInt() const; // writes each input dimension as a T
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SHAPE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Shape.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Shape.test.cpp
new file mode 100644
index 000000000..4763e016c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Shape.test.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Shape.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class ShapeTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+  // Allocator used by the tensors of a test; SetUp() creates a new one for every case.
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+template <typename T> void runShapeKernel(loco::DataType dataType, IMemoryManager *memory_manager)
+{
+  // Only the shape of the input is read by the kernel, so no buffer is attached to it.
+  Tensor input_tensor(loco::DataType::FLOAT32, Shape{1, 3, 1, 3, 5}, {}, "");
+  Tensor output_tensor = makeOutputTensor(dataType);
+
+  ShapeParams kernel_params{};
+  kernel_params.out_type = dataType;
+  ShapeKernel kernel(&input_tensor, &output_tensor, kernel_params);
+
+  // configure() resizes the output, so its memory is allocated afterwards.
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  // The result is a rank-1 tensor whose five entries repeat the input dimensions.
+  const std::vector<T> expected_data{1, 3, 1, 3, 5};
+  const std::vector<int32_t> expected_shape{5};
+
+  EXPECT_THAT(extractTensorData<T>(output_tensor), expected_data);
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(expected_shape));
+}
+
+TEST_F(ShapeTest, OutTypeInt)
+{
+  // Positive case: exercise both integer output types through the shared helper.
+  // Run for int32_t output
+  runShapeKernel<int32_t>(loco::DataType::S32, _memory_manager.get());
+  // Run for int64_t output
+  runShapeKernel<int64_t>(loco::DataType::S64, _memory_manager.get());
+
+  SUCCEED();
+}
+
+TEST_F(ShapeTest, Invalid_Output_Type_NEG)
+{
+  // FLOAT32 is neither S32 nor S64, so configure() must refuse the output tensor.
+  Tensor input_tensor(loco::DataType::FLOAT32, Shape{1, 3}, {}, "");
+  Tensor output_tensor = makeOutputTensor(loco::DataType::FLOAT32);
+
+  ShapeParams kernel_params{};
+  kernel_params.out_type = loco::DataType::FLOAT32;
+
+  ShapeKernel kernel(&input_tensor, &output_tensor, kernel_params);
+
+  // The kernel is never executed; only the type check in configure() is exercised.
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Slice.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Slice.cpp
new file mode 100644
index 000000000..2fe2c5471
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Slice.cpp
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Slice.h"
+#include "Utils.h"
+#include "PALSlice.h"
+
+#include <cassert>
+#include <cstring>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+const int max_dim = 4; // highest input rank accepted; begin/size are padded to this length
+// Registers input, begin and size as inputs 0..2 and `output` as the only output tensor.
+Slice::Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output)
+  : Kernel({input, begin, size}, {output})
+{
+}
+// Shape of the slice of `input` described by `begin` and `size` (both read as T); a size of
+// -1 extends to the end of the dimension. Throws std::runtime_error on out-of-range values.
+template <typename T>
+Shape calculateOutputShape(const Tensor *input, const Tensor *begin, const Tensor *size)
+{
+  const Shape &input_shape = input->shape();
+  const T *begin_data = getTensorData<T>(begin);
+  const T *size_data = getTensorData<T>(size);
+  Shape output_shape(input_shape.num_dims());
+  for (int idx = 0; idx < input_shape.num_dims(); idx++)
+  {
+    const T dim = input_shape.dim(idx);
+    const T begin_value = begin_data[idx];
+    T size_value = size_data[idx];
+    if (size_value < -1)
+      throw std::runtime_error("Invalid size.");
+    // A begin outside [0, dim] would give a negative extent or a read outside the input.
+    if (begin_value < 0 || begin_value > dim)
+      throw std::runtime_error("Invalid begin and size.");
+    if (size_value == -1)
+      size_value = dim - begin_value;
+    else if (size_value > dim - begin_value)
+      throw std::runtime_error("Invalid begin and size.");
+    output_shape.dim(idx) = static_cast<int>(size_value);
+  }
+  return output_shape;
+}
+// Appends the begin/size entries (read as T) to the vectors, last dimension first.
+template <typename T>
+void getBeginAndSizeVectors(int dimensions, const Tensor *begin, const Tensor *size,
+                            std::vector<int> *begins, std::vector<int> *sizes)
+{
+  for (int remaining = dimensions; remaining > 0; --remaining)
+  {
+    begins->push_back(getTensorData<T>(begin)[remaining - 1]);
+    sizes->push_back(getTensorData<T>(size)[remaining - 1]);
+  }
+}
+
+// Validates the operand types and ranks, then resizes the output to the sliced shape.
+void Slice::configure()
+{
+  assert(input()->element_type() == output()->element_type());
+  assert(begin()->element_type() == DataType::S32 || begin()->element_type() == DataType::S64);
+  assert(size()->element_type() == DataType::S32 || size()->element_type() == DataType::S64);
+  assert(begin()->shape().num_dims() == 1);
+  assert(size()->shape().num_dims() == 1);
+  assert(input()->shape().num_dims() <= max_dim);
+
+  // begin and size are both read with begin's element type, so the two types must agree;
+  // this has to hold in release builds as well, where the asserts above compile away.
+  if (size()->element_type() != begin()->element_type())
+    throw std::runtime_error("Mismatched begin and size types.");
+
+  if (begin()->element_type() == DataType::S32)
+    output()->resize(calculateOutputShape<int32_t>(input(), begin(), size()));
+  else if (begin()->element_type() == DataType::S64)
+    output()->resize(calculateOutputShape<int64_t>(input(), begin(), size()));
+  else
+    throw std::runtime_error("Unsupported type.");
+}
+
+void Slice::execute() const
+{
+  // Gather begin/size last dimension first, pad to max_dim entries, then flip to tensor order.
+  const int rank = input()->shape().num_dims();
+  std::vector<int> begins;
+  std::vector<int> sizes;
+  begins.reserve(max_dim);
+  sizes.reserve(max_dim);
+  switch (begin()->element_type())
+  {
+    case DataType::S32:
+      getBeginAndSizeVectors<int32_t>(rank, begin(), size(), &begins, &sizes);
+      break;
+    case DataType::S64:
+      getBeginAndSizeVectors<int64_t>(rank, begin(), size(), &begins, &sizes);
+      break;
+    default:
+      throw std::runtime_error("Unsupported begin type.");
+  }
+  for (int pad = rank; pad < max_dim; ++pad)
+  {
+    begins.push_back(0); // dimensions the input lacks start at offset 0 ...
+    sizes.push_back(1);  // ... and have extent 1
+  }
+  assert(begins.size() == 4);
+  assert(sizes.size() == 4);
+  tflite::SliceParams op_params{};
+  op_params.begin_count = 4;
+  op_params.size_count = 4;
+  for (int src = 3, dst = 0; dst < 4; --src, ++dst)
+  {
+    op_params.begin[dst] = begins[src];
+    op_params.size[dst] = sizes[src];
+  }
+  const auto input_shape = getTensorShape(input());
+  const auto output_shape = getTensorShape(output());
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      luci_interpreter_pal::Slice(op_params, input_shape, getTensorData<float>(input()),
+                                  output_shape, getTensorData<float>(output()));
+      break;
+    case DataType::U8:
+      luci_interpreter_pal::Slice(op_params, input_shape, getTensorData<uint8_t>(input()),
+                                  output_shape, getTensorData<uint8_t>(output()));
+      break;
+    case DataType::S8:
+      luci_interpreter_pal::Slice(op_params, input_shape, getTensorData<int8_t>(input()),
+                                  output_shape, getTensorData<int8_t>(output()));
+      break;
+    default:
+      throw std::runtime_error("Unsupported input type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Slice.h b/compiler/luci-micro/luci-interpreter/src/kernels/Slice.h
new file mode 100644
index 000000000..23c359608
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Slice.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SLICE_H
+#define LUCI_INTERPRETER_KERNELS_SLICE_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Slice : public Kernel
+{
+public:
+  Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output);
+  // Operands: the tensor to slice plus the 1-D begin and size index tensors.
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *begin() const { return _inputs[1]; }
+  const Tensor *size() const { return _inputs[2]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() derives the output shape from begin/size; execute() runs the PAL slice.
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SLICE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Slice.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Slice.test.cpp
new file mode 100644
index 000000000..517982990
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Slice.test.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Slice.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T> class SliceTest : public ::testing::Test
+{
+};
+// Element types for which the typed SliceTest suite is instantiated.
+using DataTypes = ::testing::Types<float, uint8_t, int8_t>;
+TYPED_TEST_SUITE(SliceTest, DataTypes);
+
+TYPED_TEST(SliceTest, SimpleTest)
+{
+  std::unique_ptr<IMemoryManager> mem_manager = std::make_unique<TestMemoryManager>();
+  // Source data plus the begin/size vectors; size uses -1 on the third axis.
+  Shape src_shape{3, 2, 3, 1};
+  std::vector<TypeParam> src_values{1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6};
+  Shape begin_dims{4};
+  std::vector<int32_t> begin_values{1, 0, 0, 0};
+  Shape size_dims{4};
+  std::vector<int32_t> size_values{2, 1, -1, 1};
+  std::vector<int32_t> expected_shape{2, 1, 3, 1};
+  std::vector<TypeParam> expected_values{3, 3, 3, 5, 5, 5};
+  // Wrap the raw data into tensors backed by the test memory manager.
+  Tensor src_tensor =
+    makeInputTensor<getElementType<TypeParam>()>(src_shape, src_values, mem_manager.get());
+  Tensor begin_in =
+    makeInputTensor<DataType::S32>(begin_dims, begin_values, mem_manager.get());
+  Tensor size_in = makeInputTensor<DataType::S32>(size_dims, size_values, mem_manager.get());
+
+  Tensor dst_tensor = makeOutputTensor(getElementType<TypeParam>());
+  // Same sequence as the other kernel tests: configure, allocate the output, execute.
+  Slice slice_op(&src_tensor, &begin_in, &size_in, &dst_tensor);
+  slice_op.configure();
+  mem_manager->allocate_memory(dst_tensor);
+  slice_op.execute();
+
+  EXPECT_THAT(extractTensorData<TypeParam>(dst_tensor),
+              ::testing::ElementsAreArray(expected_values));
+  EXPECT_THAT(extractTensorShape(dst_tensor), ::testing::ElementsAreArray(expected_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Softmax.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Softmax.cpp
new file mode 100644
index 000000000..c230aaa70
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Softmax.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Softmax.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/softmax.h>
+#include "PALSoftmax.h"
+#include <limits>
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Softmax::Softmax(const Tensor *input, Tensor *output, const SoftmaxParams &params)
+  : KernelWithParams<SoftmaxParams>({input}, {output}, params)
+{ // Empty body: input, output and params are all forwarded to the base-class initializer.
+}
+
+void Softmax::configure()
+{ // Type/rank checks, lookup-table setup for U8/S8, then the output takes the input shape.
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() >= 1);
+  if (input()->element_type() == DataType::U8 || input()->element_type() == DataType::S8)
+  { // The two checks below require zero point 0 for U8 and the int8_t minimum for S8.
+    LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::S8 || output()->zero_point() == 0);
+    LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::U8 ||
+                           output()->zero_point() == std::numeric_limits<int8_t>::min());
+    tflite::SoftmaxParams op_params{}; // temporary carrier that points the PAL call at _table
+    op_params.table = _table;
+    luci_interpreter_pal::PopulateSoftmaxLookupTable(&op_params, input()->scale(), params().beta);
+  }
+  output()->resize(input()->shape());
+}
+
+void Softmax::execute() const
+{
+  // Route to the evaluation routine that matches the input element type:
+  // FLOAT32 uses evalFloat(); S8 and U8 use evalQuantized<>() with the matching C++ type.
+  const auto elem_type = input()->element_type();
+  if (elem_type == DataType::FLOAT32)
+    evalFloat();
+  else if (elem_type == DataType::S8)
+    evalQuantized<int8_t>();
+  else if (elem_type == DataType::U8)
+    evalQuantized<uint8_t>();
+  else
+  {
+    // Every other element type is rejected.
+    throw std::runtime_error("Unsupported type.");
+  }
+}
+
+void Softmax::evalFloat() const
+{
+  // Only beta is set; every other field keeps its value-initialized default.
+  tflite::SoftmaxParams fp_params{};
+  fp_params.beta = params().beta;
+  tflite::reference_ops::Softmax(fp_params, getTensorShape(input()), getTensorData<float>(input()),
+                                 getTensorShape(output()), getTensorData<float>(output()));
+}
+
+template <typename T> void Softmax::evalQuantized() const
+{
+  tflite::SoftmaxParams q_params{};
+  q_params.zero_point = output()->zero_point();
+  q_params.scale = output()->scale();
+  q_params.table = const_cast<float *>(_table); // _table is const inside this const member
+  luci_interpreter_pal::InitializeParams(&q_params, input()->scale(), params().beta);
+  luci_interpreter_pal::Softmax(q_params, getTensorShape(input()), getTensorData<T>(input()),
+                                getTensorShape(output()), getTensorData<T>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Softmax.h b/compiler/luci-micro/luci-interpreter/src/kernels/Softmax.h
new file mode 100644
index 000000000..1f281df1c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Softmax.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SOFTMAX_H
+#define LUCI_INTERPRETER_KERNELS_SOFTMAX_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Softmax : public KernelWithParams<SoftmaxParams>
+{
+public:
+  Softmax(const Tensor *input, Tensor *output, const SoftmaxParams &params);
+  // Single input and single output, read from slot 0 of the tensor lists.
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+  // Overrides of the Kernel interface; their definitions live outside this header.
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  template <typename T> void evalQuantized() const;
+  // Storage handed to the PAL as SoftmaxParams::table by the quantized code paths.
+  float _table[256];
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SOFTMAX_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Softmax.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Softmax.test.cpp
new file mode 100644
index 000000000..08e70672d
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Softmax.test.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Softmax.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T> constexpr loco::DataType toLocoDataType();
+// Explicit specializations: float -> FLOAT32, uint8_t -> U8, int8_t -> S8.
+template <> constexpr loco::DataType toLocoDataType<float>() { return loco::DataType::FLOAT32; }
+
+template <> constexpr loco::DataType toLocoDataType<uint8_t>() { return loco::DataType::U8; }
+
+template <> constexpr loco::DataType toLocoDataType<int8_t>() { return loco::DataType::S8; }
+
+template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<float> input_data, std::initializer_list<float> output_data)
+{
+  // Floating-point variant: no scale/zero point is passed; data is matched via FloatArrayNear.
+  constexpr loco::DataType dtype = toLocoDataType<T>();
+  std::unique_ptr<IMemoryManager> mem_manager = std::make_unique<TestMemoryManager>();
+  Tensor in_tensor = makeInputTensor<dtype>(input_shape, input_data, mem_manager.get());
+  Tensor out_tensor = makeOutputTensor(dtype);
+
+  SoftmaxParams softmax_params{};
+  softmax_params.beta = 0.1;
+
+  Softmax softmax(&in_tensor, &out_tensor, softmax_params);
+  softmax.configure();
+  mem_manager->allocate_memory(out_tensor);
+  softmax.execute();
+
+  EXPECT_THAT(extractTensorData<T>(out_tensor), FloatArrayNear(output_data));
+  EXPECT_THAT(extractTensorShape(out_tensor), output_shape);
+}
+
+template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<float> input_data, std::initializer_list<float> output_data)
+{
+  // Quantized variant: scale/zero point come from the data ranges widened to include 0.
+  constexpr loco::DataType dtype = toLocoDataType<T>();
+  std::unique_ptr<IMemoryManager> mem_manager = std::make_unique<TestMemoryManager>();
+
+  const float in_min = std::min<float>(std::min<float>(input_data), 0.f);
+  const float in_max = std::max<float>(std::max<float>(input_data), 0.f);
+  const float out_min = std::min<float>(std::min<float>(output_data), 0.f);
+  const float out_max = std::max<float>(std::max<float>(output_data), 0.f);
+  std::pair<float, int32_t> in_quant = quantizationParams<T>(in_min, in_max);
+  std::pair<float, int32_t> out_quant = quantizationParams<T>(out_min, out_max);
+
+  Tensor in_tensor = makeInputTensor<dtype>(input_shape, in_quant.first, in_quant.second,
+                                            input_data, mem_manager.get());
+  Tensor out_tensor = makeOutputTensor(dtype, out_quant.first, out_quant.second);
+
+  SoftmaxParams softmax_params{};
+  softmax_params.beta = 0.1;
+
+  Softmax softmax(&in_tensor, &out_tensor, softmax_params);
+  softmax.configure();
+  mem_manager->allocate_memory(out_tensor);
+  softmax.execute();
+
+  EXPECT_THAT(extractTensorShape(out_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(dequantizeTensorData(out_tensor), FloatArrayNear(output_data, out_tensor.scale()));
+}
+
+template <typename T> class SoftmaxTest : public ::testing::Test
+{
+};
+// Element types for which the typed SoftmaxTest suite is instantiated.
+using DataTypes = ::testing::Types<float, uint8_t, int8_t>;
+TYPED_TEST_SUITE(SoftmaxTest, DataTypes);
+
+TYPED_TEST(SoftmaxTest, Simple)
+{
+  const std::initializer_list<float> logits = {
+    5, -9, 8,  // four rows of three values (shape {2, 1, 2, 3})
+    -7, 2, -4, //
+    1, -2, 9,  //
+    3, -6, -1, //
+  };
+  const std::initializer_list<float> expected = {
+    0.38514, 0.09497, 0.51989, // expected kernel output for each row above (beta is set in Check)
+    0.20792, 0.51141, 0.28067, //
+    0.25212, 0.18678, 0.56110, //
+    0.48149, 0.19576, 0.32275, //
+  };
+  Check<TypeParam>({2, 1, 2, 3}, {2, 1, 2, 3}, logits, expected);
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToBatchND.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToBatchND.cpp
new file mode 100644
index 000000000..630cd38c4
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToBatchND.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/SpaceToBatchND.h"
+#include "kernels/Utils.h"
+
+#include "PALSpaceToBatchND.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+// Accepted input rank range, checked in SpaceToBatchND::configure().
+const int kInputMinDimensionNum = 3;
+const int kInputMaxDimensionNum = 4;
+
+} // namespace
+
+SpaceToBatchND::SpaceToBatchND(const Tensor *input, const Tensor *block_shape,
+                               const Tensor *paddings, Tensor *output)
+  : Kernel({input, block_shape, paddings}, {output})
+{ // Empty body: the three inputs and the single output are forwarded to the Kernel base.
+}
+
+void SpaceToBatchND::configure()
+{
+  // Validates ranks and shapes, then sizes the output from the padded dims and block sizes.
+  const auto *block_shape_data = block_shape()->data<int32_t>();
+  const auto *paddings_data = paddings()->data<int32_t>();
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() >= kInputMinDimensionNum);
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() <= kInputMaxDimensionNum);
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  int spatial_dims_num = input()->shape().num_dims() - 2;
+  LUCI_INTERPRETER_CHECK(block_shape()->shape().num_dims() == 1);
+  LUCI_INTERPRETER_CHECK(block_shape()->shape().dim(0) == spatial_dims_num);
+  LUCI_INTERPRETER_CHECK(paddings()->shape().num_dims() == 2);
+  LUCI_INTERPRETER_CHECK(paddings()->shape().dim(0) == spatial_dims_num);
+  LUCI_INTERPRETER_CHECK(paddings()->shape().dim(1) == 2);
+
+  Shape output_shape = Shape(input()->shape().num_dims());
+  int output_batch_size = input()->shape().dim(0);
+  for (int i = 0; i < spatial_dims_num; ++i)
+  {
+    // Reject block sizes <= 0: `%` and `/` by zero below would be undefined behaviour.
+    LUCI_INTERPRETER_CHECK(block_shape_data[i] > 0);
+    int final_dim_size =
+      (input()->shape().dim(i + 1) + paddings_data[i * 2] + paddings_data[i * 2 + 1]);
+    LUCI_INTERPRETER_CHECK(final_dim_size % block_shape_data[i] == 0);
+    output_shape.dim(i + 1) = final_dim_size / block_shape_data[i];
+    output_batch_size = output_batch_size * block_shape_data[i];
+  }
+  output_shape.dim(0) = output_batch_size;
+  output_shape.dim(input()->shape().num_dims() - 1) =
+    input()->shape().dim(input()->shape().num_dims() - 1);
+  output()->resize(output_shape);
+}
+
+void SpaceToBatchND::execute() const
+{
+  tflite::SpaceToBatchParams op_params{}; // declared before the switch so it is initialized
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32: // float path: output_offset is fixed at 0
+      op_params.output_offset = 0;
+      luci_interpreter_pal::SpaceToBatchND(
+        op_params, getTensorShape(input()), getTensorData<float>(input()),
+        getTensorShape(block_shape()), getTensorData<int32_t>(block_shape()),
+        getTensorShape(paddings()), getTensorData<int32_t>(paddings()), getTensorShape(output()),
+        getTensorData<float>(output()));
+      break;
+    case DataType::U8: // U8 path: output_offset is the output tensor's zero point
+      op_params.output_offset = output()->zero_point();
+      luci_interpreter_pal::SpaceToBatchND(
+        op_params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+        getTensorShape(block_shape()), getTensorData<int32_t>(block_shape()),
+        getTensorShape(paddings()), getTensorData<int32_t>(paddings()), getTensorShape(output()),
+        getTensorData<uint8_t>(output()));
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToBatchND.h b/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToBatchND.h
new file mode 100644
index 000000000..0893003bb
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToBatchND.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SPACETOBATCHND_H
+#define LUCI_INTERPRETER_KERNELS_SPACETOBATCHND_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class SpaceToBatchND : public Kernel
+{
+public:
+  SpaceToBatchND(const Tensor *input, const Tensor *block_shape, const Tensor *paddings,
+                 Tensor *output);
+  // Accessors read fixed slots of the tensor lists: _inputs[0..2] and _outputs[0].
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *block_shape() const { return _inputs[1]; }
+  const Tensor *paddings() const { return _inputs[2]; }
+  Tensor *output() const { return _outputs[0]; }
+  // Overrides of the Kernel interface; their definitions live outside this header.
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SPACETOBATCHND_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp
new file mode 100644
index 000000000..3a8b0a812
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/SpaceToBatchND.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape,
+           std::initializer_list<int32_t> block_shape_shape,
+           std::initializer_list<int32_t> paddings_shape,
+           std::initializer_list<int32_t> output_shape, std::initializer_list<float> input_data,
+           std::initializer_list<int32_t> block_shape_data,
+           std::initializer_list<int32_t> paddings_data, std::initializer_list<float> output_data)
+{
+  // Generic variant: no quantization parameters, outputs are matched element by element.
+  constexpr DataType dtype = getElementType<T>();
+  std::unique_ptr<IMemoryManager> mem_manager = std::make_unique<TestMemoryManager>();
+  Tensor in_tensor = makeInputTensor<dtype>(input_shape, input_data, mem_manager.get());
+  Tensor block_tensor =
+    makeInputTensor<DataType::S32>(block_shape_shape, block_shape_data, mem_manager.get());
+  Tensor pad_tensor =
+    makeInputTensor<DataType::S32>(paddings_shape, paddings_data, mem_manager.get());
+  Tensor out_tensor = makeOutputTensor(dtype);
+
+  SpaceToBatchND op(&in_tensor, &block_tensor, &pad_tensor, &out_tensor);
+  op.configure();
+  mem_manager->allocate_memory(out_tensor);
+  op.execute();
+
+  EXPECT_THAT(extractTensorData<T>(out_tensor), ::testing::ElementsAreArray(output_data));
+  EXPECT_THAT(extractTensorShape(out_tensor), output_shape);
+}
+
+template <>
+void Check<uint8_t>(
+  std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> block_shape_shape,
+  std::initializer_list<int32_t> paddings_shape, std::initializer_list<int32_t> output_shape,
+  std::initializer_list<float> input_data, std::initializer_list<int32_t> block_shape_data,
+  std::initializer_list<int32_t> paddings_data, std::initializer_list<float> output_data)
+{
+  // U8 variant: input and output share quantization derived from the input value range.
+  std::unique_ptr<IMemoryManager> mem_manager = std::make_unique<TestMemoryManager>();
+  const std::pair<float, int32_t> quant =
+    quantizationParams<uint8_t>(std::min(input_data), std::max(input_data));
+  const float scale = quant.first;
+  const int32_t zero_point = quant.second;
+
+  Tensor in_tensor =
+    makeInputTensor<DataType::U8>(input_shape, scale, zero_point, input_data, mem_manager.get());
+  Tensor block_tensor =
+    makeInputTensor<DataType::S32>(block_shape_shape, block_shape_data, mem_manager.get());
+  Tensor pad_tensor =
+    makeInputTensor<DataType::S32>(paddings_shape, paddings_data, mem_manager.get());
+  Tensor out_tensor = makeOutputTensor(DataType::U8, scale, zero_point);
+
+  SpaceToBatchND op(&in_tensor, &block_tensor, &pad_tensor, &out_tensor);
+  op.configure();
+  mem_manager->allocate_memory(out_tensor);
+  op.execute();
+
+  EXPECT_THAT(dequantizeTensorData(out_tensor), FloatArrayNear(output_data, out_tensor.scale()));
+  EXPECT_THAT(extractTensorShape(out_tensor), output_shape);
+}
+
+template <typename T> class SpaceToBatchNDTest : public ::testing::Test
+{
+};
+// Element types for which the typed SpaceToBatchNDTest suite is instantiated.
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_SUITE(SpaceToBatchNDTest, DataTypes);
+
+TYPED_TEST(SpaceToBatchNDTest, Simple)
+{ // Padded spatial size is (5+1+0) x (2+2+0) = 6x4; block 3x2 gives batch 6 and spatial 2x2.
+  Check<TypeParam>(/*input_shape=*/{1, 5, 2, 1}, /*block_shape_shape=*/{2},
+                   /*paddings_shape=*/{2, 2},
+                   /*output_shape=*/{6, 2, 2, 1},
+                   /*input_data=*/{-1.0, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 1.0},
+                   /*block_shape_data=*/{3, 2}, /*paddings_data=*/{1, 0, 2, 0},
+                   /*output_data=*/{0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -1.0, 0, -0.7,
+                                    0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 1.0});
+}
+
+TEST(SpaceToBatchNDTest, Invalid_Shape_NEG)
+{
+  std::unique_ptr<IMemoryManager> mem_manager = std::make_unique<TestMemoryManager>();
+  // 3x3 spatial input, 2x2 block, zero padding: 3 is not a multiple of 2, so configure must fail.
+  Tensor in_tensor = makeInputTensor<DataType::FLOAT32>(
+    {1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, mem_manager.get());
+  Tensor block_tensor = makeInputTensor<DataType::S32>({2}, {2, 2}, mem_manager.get());
+  Tensor pad_tensor =
+    makeInputTensor<DataType::S32>({2, 2}, {0, 0, 0, 0}, mem_manager.get());
+  Tensor out_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  SpaceToBatchND op(&in_tensor, &block_tensor, &pad_tensor, &out_tensor);
+  EXPECT_ANY_THROW(op.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToDepth.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToDepth.cpp
new file mode 100644
index 000000000..7c29e8cb0
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToDepth.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "SpaceToDepth.h"
+#include "Utils.h"
+#include "PALSpaceToDepth.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+SpaceToDepth::SpaceToDepth(const Tensor *input, Tensor *output, const SpaceToDepthParams &params)
+  : KernelWithParams<SpaceToDepthParams>({input}, {output}, params)
+{ // Empty body: input, output and params are all forwarded to the base-class initializer.
+}
+
+void SpaceToDepth::configure()
+{
+  // Validate with LUCI_INTERPRETER_CHECK as sibling kernels do; assert() vanishes under NDEBUG.
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() == 4);
+  const auto out_type = output()->element_type();
+  LUCI_INTERPRETER_CHECK(out_type == DataType::FLOAT32 || out_type == DataType::U8 ||
+                         out_type == DataType::S8 || out_type == DataType::S32 ||
+                         out_type == DataType::S64);
+  LUCI_INTERPRETER_CHECK(input()->element_type() == out_type);
+  const int block_size = params().block_size;
+  LUCI_INTERPRETER_CHECK(block_size > 0); // a zero block size would divide by zero below
+  const int32_t input_height = input()->shape().dim(1);
+  const int32_t input_width = input()->shape().dim(2);
+  const int32_t output_height = input_height / block_size;
+  const int32_t output_width = input_width / block_size;
+  LUCI_INTERPRETER_CHECK(input_height == output_height * block_size);
+  LUCI_INTERPRETER_CHECK(input_width == output_width * block_size);
+  // Output keeps dim 0, divides height/width by block_size and scales dim 3 by block_size^2.
+  Shape output_shape(4);
+  output_shape.dim(0) = input()->shape().dim(0);
+  output_shape.dim(1) = output_height;
+  output_shape.dim(2) = output_width;
+  output_shape.dim(3) = input()->shape().dim(3) * block_size * block_size;
+  output()->resize(output_shape);
+}
+
+void SpaceToDepth::execute() const
+{
+  // Forwards to the PAL implementation selected by the input element type.
+  // Only FLOAT32 and U8 are handled here; any other element type throws.
+  // The block size is the only parameter copied into the TFLite params struct.
+  tflite::SpaceToDepthParams op_params{};
+  op_params.block_size = params().block_size;
+
+  const auto elem_type = input()->element_type();
+  if (elem_type == DataType::FLOAT32)
+    luci_interpreter_pal::SpaceToDepth(op_params, getTensorShape(input()),
+                                       getTensorData<float>(input()), getTensorShape(output()),
+                                       getTensorData<float>(output()));
+  else if (elem_type == DataType::U8)
+    luci_interpreter_pal::SpaceToDepth(op_params, getTensorShape(input()),
+                                       getTensorData<uint8_t>(input()), getTensorShape(output()),
+                                       getTensorData<uint8_t>(output()));
+  else
+    throw std::runtime_error("Unsupported type.");
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToDepth.h b/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToDepth.h
new file mode 100644
index 000000000..e66316b11
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToDepth.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SPACETODEPTH_H
+#define LUCI_INTERPRETER_KERNELS_SPACETODEPTH_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+#include <vector>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class SpaceToDepth : public KernelWithParams<SpaceToDepthParams>
+{
+public:
+  SpaceToDepth(const Tensor *input, Tensor *output, const SpaceToDepthParams &params);
+  // Single input and single output, read from slot 0 of the tensor lists.
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+  // Overrides of the Kernel interface; their definitions live outside this header.
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SPACETODEPTH_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToDepth.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToDepth.test.cpp
new file mode 100644
index 000000000..4af488618
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SpaceToDepth.test.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/SpaceToDepth.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T> class SpaceToDepthTest : public ::testing::Test
+{
+};
+// Element types for which the typed SpaceToDepthTest suite is instantiated.
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_SUITE(SpaceToDepthTest, DataTypes);
+
+TYPED_TEST(SpaceToDepthTest, SimpleCase)
+{
+  // A {1, 2, 2, 2} input with block size 2 is expected to become a single {1, 1, 1, 8} pixel
+  // whose flattened element order matches the input.
+  constexpr DataType dtype = getElementType<TypeParam>();
+  std::unique_ptr<IMemoryManager> mem_manager = std::make_unique<TestMemoryManager>();
+
+  Shape src_shape{1, 2, 2, 2};
+  std::vector<TypeParam> src_values{1, 5, 6, 7, 2, 3, 4, 8};
+  Tensor src_tensor = makeInputTensor<dtype>(src_shape, src_values, mem_manager.get());
+  Tensor dst_tensor = makeOutputTensor(dtype);
+  std::vector<int32_t> expected_shape{1, 1, 1, 8};
+  std::vector<TypeParam> expected_values{1, 5, 6, 7, 2, 3, 4, 8};
+
+  SpaceToDepthParams block_params{};
+  block_params.block_size = 2;
+  SpaceToDepth op(&src_tensor, &dst_tensor, block_params);
+  op.configure();
+  mem_manager->allocate_memory(dst_tensor);
+  op.execute();
+
+  EXPECT_THAT(extractTensorData<TypeParam>(dst_tensor),
+              ::testing::ElementsAreArray(expected_values));
+  EXPECT_THAT(extractTensorShape(dst_tensor), ::testing::ElementsAreArray(expected_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Split.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Split.cpp
new file mode 100644
index 000000000..1a563f307
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Split.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Split.h"
+
+#include "Utils.h"
+
+#include "PALSplit.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Split::Split(const Tensor *axis, const Tensor *input, std::vector<Tensor *> outputs)
+  : Kernel({axis, input}, std::move(outputs)) // input order matches axis()/input() in Split.h
+{
+}
+
+// Resolves the split axis and sizes each output to an equal slice of the input along it.
+void Split::configure()
+{
+  LUCI_INTERPRETER_CHECK(axis()->shape().num_elements() == 1); // checked in release builds too
+  _axis_value = getTensorData<int32_t>(axis())[0];
+  if (_axis_value < 0)
+    _axis_value += input()->shape().num_dims(); // negative axes count from the last dimension
+  LUCI_INTERPRETER_CHECK(_axis_value >= 0 && _axis_value < input()->shape().num_dims());
+
+  const auto num_outputs = static_cast<int32_t>(_outputs.size());
+  LUCI_INTERPRETER_CHECK(num_outputs > 0); // also guards the division below
+  const int32_t input_size = input()->shape().dim(_axis_value);
+  LUCI_INTERPRETER_CHECK(input_size % num_outputs == 0);
+
+  Shape output_shape = input()->shape();
+  output_shape.dim(_axis_value) = input_size / num_outputs;
+  for (Tensor *output : _outputs)
+    output->resize(output_shape);
+}
+
+// Hands the input and all outputs to the PAL Split routine for the input's element type.
+void Split::execute() const
+{
+  tflite::SplitParams params{};
+  params.num_split = _outputs.size();
+  params.axis = _axis_value;
+
+  const auto split_as = [&](auto type_tag) {
+    using Scalar = decltype(type_tag); // only the tag's type matters, its value is unused
+    VectorOfTensors<Scalar, false> all_outputs(_outputs);
+    luci_interpreter_pal::Split(params, getTensorShape(input()), getTensorData<Scalar>(input()),
+                                all_outputs.shapes(), all_outputs.data());
+  };
+
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      split_as(float{});
+      break;
+    case DataType::U8:
+      split_as(uint8_t{});
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Split.h b/compiler/luci-micro/luci-interpreter/src/kernels/Split.h
new file mode 100644
index 000000000..9542b1e56
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Split.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SPLIT_H
+#define LUCI_INTERPRETER_KERNELS_SPLIT_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Split : public Kernel // kernel with inputs {axis, input} and a list of output tensors
+{
+public:
+  Split(const Tensor *axis, const Tensor *input, std::vector<Tensor *> outputs);
+
+  const Tensor *axis() const { return _inputs[0]; }  // input slot 0
+  const Tensor *input() const { return _inputs[1]; } // input slot 1
+  Tensor *output(int index) const { return _outputs[index]; } // index is used unchecked
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  int32_t _axis_value{}; // zero-initialized; assigned by configure() in Split.cpp
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SPLIT_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Split.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Split.test.cpp
new file mode 100644
index 000000000..283cd9aa9
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Split.test.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Split.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Runs Split on `input_data` along `axis` and verifies the data of every output tensor.
+// `output_shape` is the shape expected of each of the `num_splits` outputs.
+template <typename T>
+void Check(int axis, int num_splits, std::initializer_list<int32_t> input_shape,
+           std::initializer_list<int32_t> output_shape, std::initializer_list<T> input_data,
+           std::vector<std::vector<T>> output_data)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  constexpr DataType element_type = getElementType<T>();
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({}, {axis}, memory_manager.get());
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
+
+  // Reserve up front so the addresses collected in the loop stay valid while the vector grows.
+  std::vector<Tensor> output_tensors;
+  output_tensors.reserve(num_splits);
+  std::vector<Tensor *> output_tensor_ptrs;
+  output_tensor_ptrs.reserve(num_splits);
+  for (int i = 0; i < num_splits; ++i)
+  {
+    output_tensors.emplace_back(makeOutputTensor(element_type));
+    output_tensor_ptrs.push_back(&output_tensors.back());
+  }
+
+  Split kernel(&axis_tensor, &input_tensor, std::move(output_tensor_ptrs));
+  kernel.configure();
+  for (Tensor &output_tensor : output_tensors)
+    memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  for (int i = 0; i < num_splits; ++i)
+  {
+    EXPECT_THAT(extractTensorData<T>(output_tensors[i]),
+                ::testing::ElementsAreArray(output_data[i]));
+    // The flat data comparison alone would not notice a wrongly shaped output.
+    EXPECT_THAT(extractTensorShape(output_tensors[i]), ::testing::ElementsAreArray(output_shape));
+  }
+}
+
+template <typename T> class SplitTest : public ::testing::Test // stateless typed fixture
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>; // element types the suite is run with
+TYPED_TEST_SUITE(SplitTest, DataTypes);
+
+TYPED_TEST(SplitTest, FourDimensional)
+{
+  // One 2x2x2x2 ramp of 1..16 is halved along each of its four axes in turn.
+  const std::initializer_list<int32_t> in_shape = {2, 2, 2, 2};
+  const std::initializer_list<TypeParam> ramp = {1, 2,  3,  4,  5,  6,  7,  8,
+                                                 9, 10, 11, 12, 13, 14, 15, 16};
+
+  // Outermost axis: the halves are the two contiguous runs of eight values.
+  Check<TypeParam>(/*axis=*/0, /*num_splits=*/2, in_shape, {1, 2, 2, 2}, ramp,
+                   {
+                     {1, 2, 3, 4, 5, 6, 7, 8},        //
+                     {9, 10, 11, 12, 13, 14, 15, 16}, //
+                   });
+  Check<TypeParam>(/*axis=*/1, /*num_splits=*/2, in_shape, {2, 1, 2, 2}, ramp,
+                   {
+                     {1, 2, 3, 4, 9, 10, 11, 12},  //
+                     {5, 6, 7, 8, 13, 14, 15, 16}, //
+                   });
+  Check<TypeParam>(/*axis=*/2, /*num_splits=*/2, in_shape, {2, 2, 1, 2}, ramp,
+                   {
+                     {1, 2, 5, 6, 9, 10, 13, 14},  //
+                     {3, 4, 7, 8, 11, 12, 15, 16}, //
+                   });
+  // Innermost axis: odd values land in the first output, even values in the second.
+  Check<TypeParam>(/*axis=*/3, /*num_splits=*/2, in_shape, {2, 2, 2, 1}, ramp,
+                   {
+                     {1, 3, 5, 7, 9, 11, 13, 15},  //
+                     {2, 4, 6, 8, 10, 12, 14, 16}, //
+                   });
+}
+
+TYPED_TEST(SplitTest, OneDimensional)
+{
+  // Eight single-element slices: every output holds exactly one value of the input.
+  Check<TypeParam>(/*axis=*/0, /*num_splits=*/8, /*input_shape=*/{8}, /*output_shape=*/{1},
+                   {1, 2, 3, 4, 5, 6, 7, 8}, {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
+}
+
+TYPED_TEST(SplitTest, NegativeAxis)
+{
+  // Axis -4 on a rank-4 input resolves to axis 0, so the outputs are the two contiguous
+  // halves of the 1..16 ramp.
+  const std::initializer_list<TypeParam> ramp = {1, 2,  3,  4,  5,  6,  7,  8,
+                                                 9, 10, 11, 12, 13, 14, 15, 16};
+
+  Check<TypeParam>(/*axis=*/-4, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2}, ramp,
+                   {{1, 2, 3, 4, 5, 6, 7, 8}, {9, 10, 11, 12, 13, 14, 15, 16}});
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SplitV.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/SplitV.cpp
new file mode 100644
index 000000000..aa6820889
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SplitV.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "SplitV.h"
+
+#include "Utils.h"
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+SplitV::SplitV(const Tensor *input, const Tensor *size_splits, const Tensor *axis,
+               std::vector<Tensor *> outputs)
+  : Kernel({input, size_splits, axis}, std::move(outputs)) // order matches SplitV.h accessors
+{
+}
+
+// Resolves the axis and gives every output the extent requested in size_splits. At most one
+// entry may be -1; that output receives whatever the explicit sizes leave of the axis.
+void SplitV::configure()
+{
+  // LUCI_INTERPRETER_CHECK keeps these validations active in release builds, unlike assert.
+  LUCI_INTERPRETER_CHECK(axis()->shape().num_elements() == 1);
+  _axis_value = getTensorData<int32_t>(axis())[0];
+  if (_axis_value < 0)
+    _axis_value += input()->shape().num_dims();
+  LUCI_INTERPRETER_CHECK(_axis_value >= 0 && _axis_value < input()->shape().num_dims());
+
+  // Validate size_splits before reading it: one entry per output is required.
+  const auto num_split = static_cast<int32_t>(_outputs.size());
+  LUCI_INTERPRETER_CHECK(size_splits()->shape().num_dims() == 1);
+  LUCI_INTERPRETER_CHECK(size_splits()->shape().num_elements() == num_split);
+  const auto sizes_data = getTensorData<int32_t>(size_splits());
+
+  // Accumulate over every entry, the last one included; leaving it out would make a -1 in
+  // any earlier position resolve to too large an extent.
+  int32_t sum = 0;
+  int32_t count_neg_dim = 0;
+  for (int32_t i = 0; i < num_split; ++i)
+  {
+    if (sizes_data[i] == -1)
+    {
+      ++count_neg_dim;
+    }
+    else
+    {
+      sum += sizes_data[i];
+    }
+  }
+  LUCI_INTERPRETER_CHECK(count_neg_dim < 2);
+
+  // With a -1 entry the explicit sizes must fit; without one they must cover the axis exactly.
+  const int32_t input_size = input()->shape().dim(_axis_value);
+  LUCI_INTERPRETER_CHECK(count_neg_dim == 1 ? sum <= input_size : sum == input_size);
+
+  auto output_shape = input()->shape();
+  for (int32_t i = 0; i < num_split; ++i)
+  {
+    output_shape.dim(_axis_value) = (sizes_data[i] == -1) ? input_size - sum : sizes_data[i];
+    _outputs[i]->resize(output_shape);
+  }
+}
+
+// Hands the input and all outputs to tflite::optimized_ops::Split for the input's element type.
+void SplitV::execute() const
+{
+  tflite::SplitParams params{};
+  params.num_split = _outputs.size();
+  params.axis = _axis_value;
+
+  const auto split_as = [&](auto type_tag) {
+    using Scalar = decltype(type_tag); // only the tag's type matters, its value is unused
+    VectorOfTensors<Scalar, false> all_outputs(_outputs);
+    tflite::optimized_ops::Split(params, getTensorShape(input()), getTensorData<Scalar>(input()),
+                                 all_outputs.shapes(), all_outputs.data());
+  };
+
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      split_as(float{});
+      break;
+    case DataType::U8:
+      split_as(uint8_t{});
+      break;
+    case DataType::S16:
+      split_as(int16_t{});
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SplitV.h b/compiler/luci-micro/luci-interpreter/src/kernels/SplitV.h
new file mode 100644
index 000000000..92f6288fb
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SplitV.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SPLIT_V_H
+#define LUCI_INTERPRETER_KERNELS_SPLIT_V_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class SplitV : public Kernel // kernel with inputs {input, size_splits, axis} and several outputs
+{
+public:
+  SplitV(const Tensor *input, const Tensor *size_splits, const Tensor *axis,
+         std::vector<Tensor *> outputs);
+
+  const Tensor *input() const { return _inputs[0]; }       // input slot 0
+  const Tensor *size_splits() const { return _inputs[1]; } // input slot 1
+  const Tensor *axis() const { return _inputs[2]; }        // input slot 2
+  Tensor *output(int index) const { return _outputs[index]; } // index is used unchecked
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  int32_t _axis_value{}; // zero-initialized; assigned by configure() in SplitV.cpp
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SPLIT_V_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SplitV.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/SplitV.test.cpp
new file mode 100644
index 000000000..035bc2122
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SplitV.test.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/SplitV.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Runs SplitV with the per-output sizes in `splits_size` along `axis` and compares the data of
+// every output with the matching row of `output_data`.
+template <typename T>
+void Check(int axis, std::initializer_list<int32_t> splits_size,
+           std::initializer_list<int32_t> input_shape, std::initializer_list<T> input_data,
+           std::vector<std::vector<T>> output_data)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  constexpr DataType element_type = getElementType<T>();
+
+  auto num_splits = static_cast<int32_t>(splits_size.size());
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
+  Tensor sizes_tensor =
+    makeInputTensor<DataType::S32>({num_splits}, splits_size, memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({}, {axis}, memory_manager.get());
+
+  // Reserve up front so the addresses collected in the loop stay valid while the vector grows.
+  std::vector<Tensor> output_tensors;
+  output_tensors.reserve(num_splits);
+  std::vector<Tensor *> output_tensor_ptrs;
+  output_tensor_ptrs.reserve(num_splits);
+  for (int i = 0; i < num_splits; ++i)
+  {
+    output_tensors.emplace_back(makeOutputTensor(element_type));
+    output_tensor_ptrs.push_back(&output_tensors.back());
+  }
+
+  SplitV kernel(&input_tensor, &sizes_tensor, &axis_tensor, std::move(output_tensor_ptrs));
+  kernel.configure();
+  // configure() resizes the outputs, so their memory is allocated only afterwards.
+  for (Tensor &output_tensor : output_tensors)
+  {
+    memory_manager->allocate_memory(output_tensor);
+  }
+  kernel.execute();
+
+  for (int i = 0; i < num_splits; ++i)
+  {
+    EXPECT_THAT(extractTensorData<T>(output_tensors[i]),
+                ::testing::ElementsAreArray(output_data[i]));
+  }
+}
+
+template <typename T> class SplitVTest : public ::testing::Test // stateless typed fixture
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t, int16_t>; // element types the suite runs with
+TYPED_TEST_SUITE(SplitVTest, DataTypes);
+
+TYPED_TEST(SplitVTest, ThreeDimensional)
+{
+  // A 3x3x3 ramp of 1..27 is cut into a 1-wide and a 2-wide part along each axis in turn.
+  const std::initializer_list<int32_t> cube = {3, 3, 3};
+  const std::initializer_list<TypeParam> ramp = {1,  2,  3,  4,  5,  6,  7,  8,  9,
+                                                 10, 11, 12, 13, 14, 15, 16, 17, 18,
+                                                 19, 20, 21, 22, 23, 24, 25, 26, 27};
+
+  Check<TypeParam>(
+    /*axis=*/0, /*splits_size=*/{1, 2}, cube, ramp,
+    {
+      {1, 2, 3, 4, 5, 6, 7, 8, 9},                                             //
+      {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27} //
+    });
+  Check<TypeParam>(
+    /*axis=*/1, /*splits_size=*/{1, 2}, cube, ramp,
+    {
+      {1, 2, 3, 10, 11, 12, 19, 20, 21},                                 //
+      {4, 5, 6, 7, 8, 9, 13, 14, 15, 16, 17, 18, 22, 23, 24, 25, 26, 27} //
+    });
+  Check<TypeParam>(
+    /*axis=*/2, /*splits_size=*/{1, 2}, cube, ramp,
+    {
+      {1, 4, 7, 10, 13, 16, 19, 22, 25},                                 //
+      {2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, 24, 26, 27} //
+    });
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Sqrt.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Sqrt.cpp
new file mode 100644
index 000000000..46e9fc9ad
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Sqrt.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Sqrt.h"
+#include "kernels/Utils.h"
+
+#include <stdexcept>
+#include <cmath>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Sqrt::Sqrt(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {} // unary kernel
+
+// Rejects mismatched input/output element types; otherwise the output takes the input's shape.
+void Sqrt::configure()
+{
+  const bool same_type = input()->element_type() == output()->element_type();
+  if (!same_type)
+    throw std::runtime_error("Input/output tensor data type mismatch.");
+  output()->resize(input()->shape());
+}
+
+// Runs the kernel for the input's element type.
+// FLOAT32 is the only type handled; every other type is rejected with std::runtime_error.
+void Sqrt::execute() const
+{
+  const bool is_float = input()->element_type() == DataType::FLOAT32;
+  if (!is_float)
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+
+  evalFloat();
+}
+
+// Stores the square root of every input element at the same flat index of the output.
+void Sqrt::evalFloat() const
+{
+  const float *src = getTensorData<float>(input());
+  float *dst = getTensorData<float>(output());
+  const int count = getTensorShape(input()).FlatSize();
+  for (int idx = 0; idx < count; ++idx)
+  {
+    dst[idx] = std::sqrt(src[idx]);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Sqrt.h b/compiler/luci-micro/luci-interpreter/src/kernels/Sqrt.h
new file mode 100644
index 000000000..4034655ed
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Sqrt.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SQRT_H
+#define LUCI_INTERPRETER_KERNELS_SQRT_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Sqrt : public Kernel // single-input, single-output kernel
+{
+public:
+  Sqrt(const Tensor *input, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const; // FLOAT32 implementation that execute() dispatches to (see Sqrt.cpp)
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SQRT_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Sqrt.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Sqrt.test.cpp
new file mode 100644
index 000000000..96835fbfc
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Sqrt.test.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Sqrt.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Runs Sqrt over `input_data` and checks the produced values and the output shape.
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<float> input_data, std::initializer_list<float> output_data)
+{
+  std::unique_ptr<IMemoryManager> mm = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data, mm.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Sqrt kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  mm->allocate_memory(output_tensor);
+  kernel.execute();
+
+  const auto actual_values = extractTensorData<float>(output_tensor);
+  EXPECT_THAT(actual_values, FloatArrayNear(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+TEST(SqrtTest, SimpleSqrt)
+{
+  // Covers zero, a perfect square, irrational roots and an input below one.
+  const std::initializer_list<int32_t> shape = {1, 2, 4, 1};
+  const std::initializer_list<float> values = {
+    0, 8, 2, 4,    //
+    3, 7, 10, 0.3, //
+  };
+  const std::initializer_list<float> roots = {
+    0.0, 2.8284271, 1.4142136, 2,                //
+    1.7320508, 2.6457513, 3.1622777, 0.54772256, //
+  };
+
+  Check(/*input_shape=*/shape, /*output_shape=*/shape, values, roots);
+}
+
+TEST(SqrtTest, Input_Output_Type_NEG)
+{
+  // A FLOAT32 input paired with an S32 output must be rejected while configuring.
+  std::unique_ptr<IMemoryManager> mm = std::make_unique<TestMemoryManager>();
+  Tensor float_input = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, mm.get());
+  Tensor int_output = makeOutputTensor(DataType::S32);
+
+  Sqrt kernel(&float_input, &int_output);
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(SqrtTest, Invalid_Input_Type_NEG)
+{
+  // S64 passes configure() because both types agree; execute() must then refuse the type.
+  std::unique_ptr<IMemoryManager> mm = std::make_unique<TestMemoryManager>();
+  Tensor s64_input = makeInputTensor<DataType::S64>({1}, {1}, mm.get());
+  Tensor s64_output = makeOutputTensor(DataType::S64);
+
+  Sqrt kernel(&s64_input, &s64_output);
+  kernel.configure();
+  mm->allocate_memory(s64_output);
+  EXPECT_ANY_THROW(kernel.execute());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Square.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Square.cpp
new file mode 100644
index 000000000..bc71905c1
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Square.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Square.h"
+#include "kernels/Utils.h"
+
+#include <stdexcept>
+#include <cmath>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Square::Square(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {} // unary kernel
+
+// Requires identical input and output element types, then sizes the output like the input.
+void Square::configure()
+{
+  const bool types_match = input()->element_type() == output()->element_type();
+  if (!types_match)
+    throw std::runtime_error("Input/output tensor data type mismatch.");
+  output()->resize(input()->shape());
+}
+
+// Selects the implementation for the input's element type.
+// Only FLOAT32 is handled; any other type raises std::runtime_error.
+void Square::execute() const
+{
+  const DataType type = input()->element_type();
+  if (type != DataType::FLOAT32)
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+
+  evalFloat();
+}
+
+// Squares every element of the input and stores it at the same position of the output.
+void Square::evalFloat() const
+{
+  const float *src = getTensorData<float>(input());
+  float *dst = getTensorData<float>(output());
+  const float *const src_end = src + getTensorShape(input()).FlatSize();
+  for (; src != src_end; ++src, ++dst)
+  {
+    *dst = (*src) * (*src);
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Square.h b/compiler/luci-micro/luci-interpreter/src/kernels/Square.h
new file mode 100644
index 000000000..73ed5a707
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Square.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SQUARE_H
+#define LUCI_INTERPRETER_KERNELS_SQUARE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Square : public Kernel // single-input, single-output kernel
+{
+public:
+  Square(const Tensor *input, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const; // FLOAT32 implementation that execute() dispatches to (see Square.cpp)
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SQUARE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Square.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Square.test.cpp
new file mode 100644
index 000000000..51662dea7
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Square.test.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Square.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(SquareTest, Float)
+{
+  // Mixes positive, negative, zero and fractional inputs; squaring drops every sign.
+  const Shape shape{3, 1, 2};
+  const std::vector<float> inputs{1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
+  const std::vector<float> expected{1.0, 0.0, 1.0, 121.0, 4.0, 2.0736};
+
+  std::unique_ptr<IMemoryManager> mm = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(shape, inputs, mm.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Square kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  mm->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(expected));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SquaredDifference.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/SquaredDifference.cpp
new file mode 100644
index 000000000..3bafeba4a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SquaredDifference.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/SquaredDifference.h"
+
+#include "kernels/Utils.h"
+
+#include "kernels/BinaryOpCommon.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+SquaredDifference::SquaredDifference(const Tensor *input1, const Tensor *input2, Tensor *output)
+  : Kernel({input1, input2}, {output}) // slot order matches the input1()/input2() accessors
+{
+}
+
+void SquaredDifference::configure() // validates operand types, then sizes the broadcast output
+{
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type())
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type())
+  output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+// FLOAT32 is the only element type handled; any other type raises std::runtime_error.
+void SquaredDifference::execute() const
+{
+  const DataType type = input1()->element_type();
+  if (type != DataType::FLOAT32)
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+
+  evalSquaredDifference<float>();
+}
+
+// Applies (x - y)^2 to every pair of elements of the two inputs through
+// BinaryOpBroadcastSlow, writing the results to the output.
+template <typename T> inline void SquaredDifference::evalSquaredDifference() const
+{
+  const auto squared_diff = [](T x, T y) { const T d = x - y; return d * d; };
+  BinaryOpBroadcastSlow(getTensorShape(input1()), getTensorData<T>(input1()),
+                        getTensorShape(input2()), getTensorData<T>(input2()),
+                        getTensorShape(output()), getTensorData<T>(output()), squared_diff);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SquaredDifference.h b/compiler/luci-micro/luci-interpreter/src/kernels/SquaredDifference.h
new file mode 100644
index 000000000..9327caf93
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SquaredDifference.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SQUAREDDIFFERENCE_H
+#define LUCI_INTERPRETER_KERNELS_SQUAREDDIFFERENCE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class SquaredDifference : public Kernel
+{
+public:
+  SquaredDifference(const Tensor *input1, const Tensor *input2, Tensor *output);
+  // Accessors: the operands are _inputs[0] and _inputs[1]; the result tensor is _outputs[0].
+  const Tensor *input1() const { return _inputs[0]; }
+  const Tensor *input2() const { return _inputs[1]; }
+  Tensor *output() const { return _outputs[0]; }
+  // Kernel interface: both functions override virtuals of the base class.
+  void configure() override;
+  void execute() const override;
+
+private:
+  template <typename T> inline void evalSquaredDifference() const; // worker for element type T
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SQUAREDDIFFERENCE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/SquaredDifference.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/SquaredDifference.test.cpp
new file mode 100644
index 000000000..2819c01e2
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/SquaredDifference.test.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/SquaredDifference.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(SquaredDifferenceTest, Float)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  // Both operands use the same shape, so this case involves no broadcasting.
+  Shape input_shape{3, 1, 2};
+  std::vector<float> input_data1{1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
+  std::vector<float> input_data2{-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
+  Tensor input_tensor1 =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data1, memory_manager.get());
+  Tensor input_tensor2 =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data2, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  // configure() resizes the output, so its memory is allocated only afterwards.
+  SquaredDifference kernel(&input_tensor1, &input_tensor2, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Reference values are (input_data1[i] - input_data2[i])^2 for each element.
+  std::vector<float> ref_output_data{4.0, 0.0, 4.0, 1.0, 1.0, 0.0001};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(SquaredDifferenceTest, FloatBroadcast)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  // The second operand is a one-element tensor that is broadcast against the first operand.
+  Shape input_shape1{3, 1, 2};
+  Shape input_shape2{1};
+  std::vector<float> input_data1{1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
+  std::vector<float> input_data2{1.0};
+  Tensor input_tensor1 =
+    makeInputTensor<DataType::FLOAT32>(input_shape1, input_data1, memory_manager.get());
+  Tensor input_tensor2 =
+    makeInputTensor<DataType::FLOAT32>(input_shape2, input_data2, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  // configure() resizes the output, so its memory is allocated only afterwards.
+  SquaredDifference kernel(&input_tensor1, &input_tensor2, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Reference values are (input_data1[i] - 1.0)^2 for each element.
+  std::vector<float> ref_output_data{0.0, 1.0, 4.0, 100.0, 9.0, 5.9536};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Squeeze.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Squeeze.cpp
new file mode 100644
index 000000000..4a75518c7
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Squeeze.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Squeeze.h"
+
+#include "kernels/Utils.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Squeeze::Squeeze(const Tensor *input, Tensor *output, const SqueezeParams &params)
+  : KernelWithParams<SqueezeParams>({input}, {output}, params) // one input, one output
+{
+}
+
+void Squeeze::configure()
+{
+  // Computes the output shape: the input shape without the dimensions selected for squeezing.
+  const int input_num_dims = input()->shape().num_dims();
+  const int num_squeeze_dims = params().squeeze_dims.size();
+  // should_squeeze has 8 entries; this check stays active in builds that disable assert().
+  LUCI_INTERPRETER_CHECK(input_num_dims <= 8);
+  bool should_squeeze[8] = {false};
+  int num_squeezed_dims = 0;
+  if (num_squeeze_dims == 0)
+  {
+    // No explicit list: drop every dimension of extent 1.
+    for (int idx = 0; idx < input_num_dims; ++idx)
+    {
+      should_squeeze[idx] = input()->shape().dim(idx) == 1;
+      num_squeezed_dims += should_squeeze[idx] ? 1 : 0;
+    }
+  }
+  else
+  {
+    for (int idx = 0; idx < num_squeeze_dims; ++idx)
+    {
+      // Negative entries count from the last dimension.
+      const int raw = params().squeeze_dims[idx];
+      const int current = raw < 0 ? raw + input_num_dims : raw;
+      // `current` indexes should_squeeze, so reject out-of-range or non-unit dimensions.
+      LUCI_INTERPRETER_CHECK(current >= 0 && current < input_num_dims);
+      LUCI_INTERPRETER_CHECK(input()->shape().dim(current) == 1);
+      num_squeezed_dims += should_squeeze[current] ? 0 : 1;
+      should_squeeze[current] = true;
+    }
+  }
+  Shape output_shape(input_num_dims - num_squeezed_dims);
+  for (int in_idx = 0, out_idx = 0; in_idx < input_num_dims; ++in_idx)
+  {
+    if (!should_squeeze[in_idx])
+      output_shape.dim(out_idx++) = input()->shape().dim(in_idx);
+  }
+  output()->resize(output_shape);
+}
+
+void Squeeze::execute() const
+{
+  // Only the shape differs between input and output, so the element buffer is copied as-is.
+  const auto num_elements = input()->shape().num_elements();
+  assert(num_elements == output()->shape().num_elements());
+
+  const auto num_bytes = getDataTypeSize(input()->element_type()) * num_elements;
+  std::memcpy(output()->data<void>(), input()->data<void>(), num_bytes);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Squeeze.h b/compiler/luci-micro/luci-interpreter/src/kernels/Squeeze.h
new file mode 100644
index 000000000..687af5158
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Squeeze.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SQUEEZE_H
+#define LUCI_INTERPRETER_KERNELS_SQUEEZE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Squeeze : public KernelWithParams<SqueezeParams>
+{
+public:
+  Squeeze(const Tensor *input, Tensor *output, const SqueezeParams &params);
+  // The tensor whose shape is squeezed (_inputs[0]) and the tensor receiving the result.
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() derives the output shape; execute() copies the elements into the output.
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SQUEEZE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Squeeze.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Squeeze.test.cpp
new file mode 100644
index 000000000..1bc0b6459
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Squeeze.test.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Squeeze.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<T> input_data, std::initializer_list<T> output_data,
+           std::initializer_list<int32_t> squeeze_dims)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  // The tensor element type is derived from the C++ type T under test.
+  constexpr DataType element_type = getElementType<T>();
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(element_type);
+  // squeeze_dims is passed to the kernel through SqueezeParams.
+  SqueezeParams params{};
+  params.squeeze_dims = squeeze_dims;
+  // The output is allocated after configure(), which is where the kernel resizes it.
+  Squeeze kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Both the copied elements and the resulting shape are verified.
+  EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+template <typename T> class SqueezeTest : public ::testing::Test
+{
+};
+// Element types the typed test suite is instantiated for.
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_SUITE(SqueezeTest, DataTypes);
+
+TYPED_TEST(SqueezeTest, TotalTest) // squeezes both unit dimensions of a {1, 24, 1} input
+{
+  Check<TypeParam>(
+    /*input_shape=*/{1, 24, 1}, /*output_shape=*/{24},
+    /*input_data=*/{1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                    13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24},
+    /*output_data=*/{1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                     13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24},
+    /*squeeze_dims=*/{-1, 0}); // -1 is the last dimension, so this names axes 2 and 0
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/StridedSlice.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/StridedSlice.cpp
new file mode 100644
index 000000000..a8730d861
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/StridedSlice.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/StridedSlice.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/strided_slice.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+StridedSlice::StridedSlice(const Tensor *input, const Tensor *begin, const Tensor *end,
+                           const Tensor *strides, Tensor *output, const StridedSliceParams &params)
+  : KernelWithParams<StridedSliceParams>({input, begin, end, strides}, {output}, params)
+{ // the input order {input, begin, end, strides} fixes the _inputs indices used by accessors
+}
+
+void StridedSlice::configure()
+{
+  // Validates the operands and derives the output shape from begin/end/strides and the masks.
+  // These conditions guard raw int32 reads and the fixed-size op_params arrays, so they use
+  // LUCI_INTERPRETER_CHECK rather than assert(), which is compiled out when NDEBUG is defined.
+  LUCI_INTERPRETER_CHECK(begin()->shape().num_dims() == 1);
+  LUCI_INTERPRETER_CHECK(end()->shape().num_dims() == 1);
+  LUCI_INTERPRETER_CHECK(strides()->shape().num_dims() == 1);
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(begin()->element_type() == DataType::S32);
+  LUCI_INTERPRETER_CHECK(end()->element_type() == DataType::S32);
+  LUCI_INTERPRETER_CHECK(strides()->element_type() == DataType::S32);
+  LUCI_INTERPRETER_CHECK(input()->shape().num_dims() <= 4);
+  if (params().ellipsis_mask != 0)
+    throw std::runtime_error("ellipsis_mask is not implemented yet.");
+  if (params().new_axis_mask != 0)
+    throw std::runtime_error("new_axis_mask is not implemented yet.");
+  if (input()->element_type() == DataType::U8)
+  {
+    // execute() moves raw uint8 values, so both tensors are expected to share quantization.
+    assert(input()->scale() == output()->scale());
+    assert(input()->zero_point() == output()->zero_point());
+  }
+
+  const int num_dims = input()->shape().num_dims();
+  tflite::StridedSliceParams op_params{};
+  op_params.start_indices_count = num_dims;
+  op_params.stop_indices_count = num_dims;
+  op_params.strides_count = num_dims;
+  for (int i = 0; i < num_dims; i++)
+  {
+    op_params.start_indices[i] = getTensorData<int32_t>(begin())[i];
+    op_params.stop_indices[i] = getTensorData<int32_t>(end())[i];
+    op_params.strides[i] = getTensorData<int32_t>(strides())[i];
+  }
+  op_params.begin_mask = params().begin_mask;
+  op_params.ellipsis_mask = 0;
+  op_params.end_mask = params().end_mask;
+  op_params.new_axis_mask = 0;
+  op_params.shrink_axis_mask = params().shrink_axis_mask;
+
+  // Extents are collected from the last axis to the first and reversed when copied out.
+  const auto input_shape = getTensorShape(input());
+  std::vector<int32_t> output_shape_vector;
+  for (int i = 0; i < num_dims; i++)
+  {
+    const int idx = num_dims - i - 1;
+    const int32_t stride = getTensorData<int32_t>(strides())[idx];
+    // A zero stride would make the extent computation below divide by zero.
+    LUCI_INTERPRETER_CHECK(stride != 0);
+    // An axis selected by shrink_axis_mask does not appear in the output shape.
+    if (params().shrink_axis_mask & (1 << idx))
+      continue;
+
+    const int32_t start = ::tflite::strided_slice::StartForAxis(op_params, input_shape, idx);
+    const int32_t stop = ::tflite::strided_slice::StopForAxis(op_params, input_shape, idx, start);
+
+    // Number of strided steps from start to stop, rounded up and clamped at zero.
+    const int32_t extent = std::ceil((stop - start) / static_cast<float>(stride));
+    output_shape_vector.push_back(extent < 0 ? 0 : extent);
+  }
+
+  Shape output_shape = Shape(output_shape_vector.size());
+  for (size_t i = 0; i < output_shape_vector.size(); i++)
+  {
+    output_shape.dim(i) = output_shape_vector[output_shape_vector.size() - i - 1];
+  }
+  output()->resize(output_shape);
+}
+
+void StridedSlice::execute() const
+{
+  // Rebuild the TFLite parameter block from the begin/end/strides tensors and the masks.
+  const int num_dims = input()->shape().num_dims();
+  const int32_t *begin_data = getTensorData<int32_t>(begin());
+  const int32_t *end_data = getTensorData<int32_t>(end());
+  const int32_t *strides_data = getTensorData<int32_t>(strides());
+  tflite::StridedSliceParams op_params{};
+  op_params.start_indices_count = num_dims;
+  op_params.stop_indices_count = num_dims;
+  op_params.strides_count = num_dims;
+  for (int axis = 0; axis < num_dims; ++axis)
+  {
+    op_params.start_indices[axis] = begin_data[axis];
+    op_params.stop_indices[axis] = end_data[axis];
+    op_params.strides[axis] = strides_data[axis];
+  }
+  op_params.begin_mask = params().begin_mask;
+  op_params.end_mask = params().end_mask;
+  op_params.shrink_axis_mask = params().shrink_axis_mask;
+  op_params.ellipsis_mask = 0;
+  op_params.new_axis_mask = 0;
+
+  // One call site for the reference kernel; its element type follows the pointer arguments.
+  const auto slice = [&](const auto *src, auto *dst) {
+    tflite::reference_ops::StridedSlice(op_params, getTensorShape(input()), src,
+                                        getTensorShape(output()), dst);
+  };
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      return slice(getTensorData<float>(input()), getTensorData<float>(output()));
+    case DataType::U8:
+      return slice(getTensorData<uint8_t>(input()), getTensorData<uint8_t>(output()));
+    case DataType::S32:
+      return slice(getTensorData<int32_t>(input()), getTensorData<int32_t>(output()));
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/StridedSlice.h b/compiler/luci-micro/luci-interpreter/src/kernels/StridedSlice.h
new file mode 100644
index 000000000..fc96893a7
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/StridedSlice.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_STRIDEDSLICE_H
+#define LUCI_INTERPRETER_KERNELS_STRIDEDSLICE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class StridedSlice : public KernelWithParams<StridedSliceParams>
+{
+public:
+  StridedSlice(const Tensor *input, const Tensor *begin, const Tensor *end, const Tensor *strides,
+               Tensor *output, const StridedSliceParams &params);
+  // Tensor accessors; the _inputs indices follow the tensor order of the constructor arguments.
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *begin() const { return _inputs[1]; }
+  const Tensor *end() const { return _inputs[2]; }
+  const Tensor *strides() const { return _inputs[3]; }
+  Tensor *output() const { return _outputs[0]; }
+  // configure() computes the output shape; execute() performs the slice.
+  void configure() override;
+  void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_STRIDEDSLICE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/StridedSlice.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/StridedSlice.test.cpp
new file mode 100644
index 000000000..399cdebed
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/StridedSlice.test.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/StridedSlice.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(StridedSliceTest, Float)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  // A {2, 3, 2} input sliced with begin {0, 0, 0}, end {1, 3, 2} and unit strides.
+  Shape input_shape{2, 3, 2};
+  std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  Shape begin_shape{3};
+  std::vector<int32_t> begin_data{0, 0, 0};
+  Shape end_shape{3};
+  std::vector<int32_t> end_data{1, 3, 2};
+  Shape strides_shape{3};
+  std::vector<int32_t> strides_data{1, 1, 1};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor begin_tensor =
+    makeInputTensor<DataType::S32>(begin_shape, begin_data, memory_manager.get());
+  Tensor end_tensor = makeInputTensor<DataType::S32>(end_shape, end_data, memory_manager.get());
+  Tensor strides_tensor =
+    makeInputTensor<DataType::S32>(strides_shape, strides_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  // Bit 0 of shrink_axis_mask is set, so axis 0 is dropped from the output shape.
+  StridedSliceParams params{};
+  params.begin_mask = 0;
+  params.end_mask = 0;
+  params.ellipsis_mask = 0;
+  params.new_axis_mask = 0;
+  params.shrink_axis_mask = 1;
+  // The output is allocated after configure(), which resizes it.
+  StridedSlice kernel(&input_tensor, &begin_tensor, &end_tensor, &strides_tensor, &output_tensor,
+                      params);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // Expected result: the first six input values, arranged as a {3, 2} tensor.
+  std::vector<int32_t> output_shape{3, 2};
+  std::vector<float> output_data{1, 2, 3, 4, 5, 6};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+TEST(StridedSliceTest, Uint8)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  // Same slice as the Float test, on U8 tensors (1.0f / 0 are presumably scale / zero point).
+  Shape input_shape{2, 3, 2};
+  std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  Shape begin_shape{3};
+  std::vector<int32_t> begin_data{0, 0, 0};
+  Shape end_shape{3};
+  std::vector<int32_t> end_data{1, 3, 2};
+  Shape strides_shape{3};
+  std::vector<int32_t> strides_data{1, 1, 1};
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>(input_shape, 1.0f, 0, input_data, memory_manager.get());
+  Tensor begin_tensor =
+    makeInputTensor<DataType::S32>(begin_shape, begin_data, memory_manager.get());
+  Tensor end_tensor = makeInputTensor<DataType::S32>(end_shape, end_data, memory_manager.get());
+  Tensor strides_tensor =
+    makeInputTensor<DataType::S32>(strides_shape, strides_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, 1.0f, 0);
+  // Bit 0 of shrink_axis_mask is set, so axis 0 is dropped from the output shape.
+  StridedSliceParams params{};
+  params.begin_mask = 0;
+  params.end_mask = 0;
+  params.ellipsis_mask = 0;
+  params.new_axis_mask = 0;
+  params.shrink_axis_mask = 1;
+  // The output is allocated after configure(), which resizes it.
+  StridedSlice kernel(&input_tensor, &begin_tensor, &end_tensor, &strides_tensor, &output_tensor,
+                      params);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+  // The output is dequantized before being compared with the float reference values.
+  std::vector<int32_t> output_shape{3, 2};
+  std::vector<float> output_data{1, 2, 3, 4, 5, 6};
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Sub.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Sub.cpp
new file mode 100644
index 000000000..24b6a72e5
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Sub.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Sub.h"
+#include "kernels/Utils.h"
+
+#include "PALSub.h"
+
+#include <tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Sub::Sub(const Tensor *input1, const Tensor *input2, Tensor *output, const SubParams &params)
+  : KernelWithParams<SubParams>({input1, input2}, {output}, params) // two inputs, one output
+{
+}
+
+void Sub::configure() // validates element types, then sizes the output for broadcasting
+{
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type());
+  LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type());
+  output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+void Sub::execute() const
+{
+  // Each supported element type has a dedicated evaluation routine; any other type is an error.
+  const auto element_type = input1()->element_type();
+  switch (element_type)
+  {
+    case DataType::FLOAT32:
+      return evalFloat();
+    case DataType::S64:
+      return evalInteger<int64_t>();
+    case DataType::S32:
+      return evalInteger<int32_t>();
+    case DataType::U8:
+      return evalQuantized();
+    default:
+      break;
+  }
+
+  throw std::runtime_error("Unsupported type.");
+}
+
+void Sub::evalFloat() const
+{
+  const auto lhs_shape = getTensorShape(input1());
+  const auto rhs_shape = getTensorShape(input2());
+  const auto out_shape = getTensorShape(output());
+  const float *lhs = getTensorData<float>(input1());
+  const float *rhs = getTensorData<float>(input2());
+  float *out = getTensorData<float>(output());
+  // Fill the activation range of `params` for the activation configured on this op.
+  tflite::ArithmeticParams params{};
+  fillArithmeticActivationRange<float>(params, _params.activation);
+
+  // `params` is handed to ProcessBroadcastShapes by pointer before either kernel runs.
+  if (tflite::reference_ops::ProcessBroadcastShapes(lhs_shape, rhs_shape, &params))
+  {
+    tflite::reference_ops::BroadcastSubSlow(params, lhs_shape, lhs, rhs_shape, rhs, out_shape, out);
+    return;
+  }
+  // No broadcasting needed: the luci_interpreter_pal implementation is used.
+  luci_interpreter_pal::Sub(params, lhs_shape, lhs, rhs_shape, rhs, out_shape, out);
+}
+
+template <typename T> void Sub::evalInteger() const
+{
+  const auto lhs_shape = getTensorShape(input1());
+  const auto rhs_shape = getTensorShape(input2());
+  const auto out_shape = getTensorShape(output());
+  const T *lhs = getTensorData<T>(input1());
+  const T *rhs = getTensorData<T>(input2());
+  T *out = getTensorData<T>(output());
+  // Fill the activation range of `params` for the activation configured on this op.
+  tflite::ArithmeticParams params{};
+  fillArithmeticActivationRange<T>(params, _params.activation);
+
+  // `params` is handed to ProcessBroadcastShapes by pointer before either kernel runs.
+  if (tflite::reference_ops::ProcessBroadcastShapes(lhs_shape, rhs_shape, &params))
+  {
+    tflite::reference_ops::BroadcastSubSlow(params, lhs_shape, lhs, rhs_shape, rhs, out_shape, out);
+    return;
+  }
+  // No broadcasting needed: the TFLite reference kernel is called directly.
+  tflite::reference_ops::Sub(params, lhs_shape, lhs, rhs_shape, rhs, out_shape, out);
+}
+
+void Sub::evalQuantized() const
+{
+  // Derives fixed-point multiplier/shift pairs from the tensor scales, fills the TFLite
+  // arithmetic parameters with them and runs the uint8 subtraction.
+  constexpr int left_shift = 20;
+  const double scale1 = static_cast<double>(input1()->scale());
+  const double scale2 = static_cast<double>(input2()->scale());
+  const double scale_out = static_cast<double>(output()->scale());
+  const double twice_max_input_scale = 2 * std::max(scale1, scale2);
+  const double real_multiplier1 = scale1 / twice_max_input_scale;
+  const double real_multiplier2 = scale2 / twice_max_input_scale;
+  const double real_multiplier_out = twice_max_input_scale / ((1 << left_shift) * scale_out);
+
+  int32_t multiplier1{}, multiplier2{}, multiplier_out{};
+  int shift1{}, shift2{}, shift_out{};
+  quantizeMultiplierSmallerThanOneExp(real_multiplier1, &multiplier1, &shift1);
+  quantizeMultiplierSmallerThanOneExp(real_multiplier2, &multiplier2, &shift2);
+  quantizeMultiplierSmallerThanOneExp(real_multiplier_out, &multiplier_out, &shift_out);
+
+  // Quantized activation range computed from the activation type and the output tensor.
+  int32_t act_min{}, act_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &act_min, &act_max);
+
+  tflite::ArithmeticParams params{};
+  params.left_shift = left_shift;
+  // The kernel expects inputs' zero points to be negated (note the '-' on both inputs).
+  params.input1_offset = -input1()->zero_point();
+  params.input2_offset = -input2()->zero_point();
+  params.output_offset = output()->zero_point();
+  params.input1_multiplier = multiplier1;
+  params.input2_multiplier = multiplier2;
+  params.output_multiplier = multiplier_out;
+  params.input1_shift = shift1;
+  params.input2_shift = shift2;
+  params.output_shift = shift_out;
+  params.quantized_activation_min = act_min;
+  params.quantized_activation_max = act_max;
+
+  const auto lhs_shape = getTensorShape(input1());
+  const auto rhs_shape = getTensorShape(input2());
+  const auto out_shape = getTensorShape(output());
+  const uint8_t *lhs = getTensorData<uint8_t>(input1());
+  const uint8_t *rhs = getTensorData<uint8_t>(input2());
+  uint8_t *out = getTensorData<uint8_t>(output());
+
+  // `params` is handed to ProcessBroadcastShapes by pointer before either kernel runs.
+  if (tflite::reference_ops::ProcessBroadcastShapes(lhs_shape, rhs_shape, &params))
+  {
+    tflite::reference_ops::BroadcastSubSlow(params, lhs_shape, lhs, rhs_shape, rhs, out_shape, out);
+    return;
+  }
+  tflite::reference_ops::Sub(params, lhs_shape, lhs, rhs_shape, rhs, out_shape, out);
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Sub.h b/compiler/luci-micro/luci-interpreter/src/kernels/Sub.h
new file mode 100644
index 000000000..23952b3bd
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Sub.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SUB_H
+#define LUCI_INTERPRETER_KERNELS_SUB_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Sub : public KernelWithParams<SubParams> // subtraction kernel with two inputs and one output
+{
+public:
+  Sub(const Tensor *input1, const Tensor *input2, Tensor *output, const SubParams &params);
+
+  const Tensor *input1() const { return _inputs[0]; } // first entry of the kernel's input list
+  const Tensor *input2() const { return _inputs[1]; } // second entry of the kernel's input list
+  Tensor *output() const { return _outputs[0]; }      // the only output tensor
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;                         // presumably the FLOAT32 path - defined in Sub.cpp
+  template <typename T> void evalInteger() const; // T is the integer element type being evaluated
+  void evalQuantized() const;                     // presumably the U8 path - defined in Sub.cpp
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SUB_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Sub.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Sub.test.cpp
new file mode 100644
index 000000000..9abafd49a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Sub.test.cpp
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Sub.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+#include <algorithm>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+using std::pair;
+using std::vector;
+using std::transform;
+using std::initializer_list;
+
+class SubTest : public ::testing::Test // fixture: every Sub test starts with a fresh memory manager
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager; // passed to the tensor helpers to allocate buffers
+};
+
+// Tolerance for quantized Sub: one quantization step of the [min, max] range over 255 levels.
+float GetTolerance(float min, float max)
+{
+  const float range = max - min;
+  return range / 255.0;
+}
+
+TEST_F(SubTest, Uint8) // uint8 quantized Sub with broadcasting, checked in both operand orders
+{
+  Shape base_shape = {2, 3, 1, 2};
+  vector<float> base_data = {-0.3f, 2.3f, 0.9f,  0.5f, 0.8f, -1.1f,
+                             1.2f,  2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
+  vector<Shape> test_shapes = {{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  vector<float> test_data = {0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
+  vector<vector<int32_t>> output_shapes = {{2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
+  vector<vector<float>> output_data = {
+    {-0.5f, 2.0f,  0.1f,  1.8f,  -1.3f, 1.4f,  0.7f, 0.2f,  1.3f, 0.0f,  -0.1f, -0.4f,
+     0.6f,  -1.4f, 1.2f,  -1.6f, -0.2f, -2.0f, 1.0f, 2.5f,  1.6f, 2.3f,  0.2f,  1.9f,
+     -1.8f, -0.3f, -1.2f, -0.5f, -2.6f, -0.9f, 0.5f, -2.5f, 1.1f, -2.7f, -0.3f, -3.0f},
+    {-0.5f, 2.0f, 1.3f, 0.0f, -0.2f, -2.0f, 1.0f, 2.5f, -1.2f, -0.5f, -0.3f, -3.0f},
+    {-0.5f, 2.1f,  -0.6f, 2.0f,  0.1f,  2.7f,  0.7f, 0.3f,  0.6f,  0.2f,  1.3f,  0.9f,
+     0.6f,  -1.3f, 0.5f,  -1.4f, 1.2f,  -0.7f, 0.7f, 2.3f,  0.2f,  1.8f,  0.3f,  1.9f,
+     -2.1f, -0.5f, -2.6f, -1.0f, -2.5f, -0.9f, 0.2f, -2.7f, -0.3f, -3.0f, -0.2f, -3.0f},
+    {-0.5f, 2.1f, 0.6f, 0.2f, 1.2f, -0.7f, 0.7f, 2.3f, -2.6f, -1.0f, -0.2f, -3.0f}};
+
+  float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
+  pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-3.f, 3.f); // {scale, zero point}
+  for (size_t i = 0; i < output_data.size(); ++i) // first pass: base - test for every test shape
+  {
+    Tensor input1_tensor = makeInputTensor<DataType::U8>(
+      base_shape, quant_param.first, quant_param.second, base_data, _memory_manager.get());
+    Tensor input2_tensor = makeInputTensor<DataType::U8>(
+      test_shapes[i], quant_param.first, quant_param.second, test_data, _memory_manager.get());
+    Tensor output_tensor =
+      makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
+
+    SubParams params{};
+    params.activation = Activation::NONE;
+
+    Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(output_tensor); // allocated only after configure()
+    kernel.execute();
+
+    EXPECT_THAT(dequantizeTensorData(output_tensor),
+                FloatArrayNear(output_data[i], kQuantizedTolerance));
+    EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shapes[i]));
+  }
+
+  // Subtraction is not commutative: with the operands exchanged every expected value flips sign.
+  for (auto &expected : output_data)
+  {
+    transform(expected.begin(), expected.end(), expected.begin(), [](float v) { return -v; });
+  }
+
+  // Re-run with exchanged inputs.
+  for (size_t i = 0; i < output_data.size(); ++i)
+  {
+    Tensor input1_tensor = makeInputTensor<DataType::U8>(
+      test_shapes[i], quant_param.first, quant_param.second, test_data, _memory_manager.get());
+    Tensor input2_tensor = makeInputTensor<DataType::U8>(
+      base_shape, quant_param.first, quant_param.second, base_data, _memory_manager.get());
+    Tensor output_tensor =
+      makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
+
+    SubParams params{};
+    params.activation = Activation::NONE;
+
+    Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
+    kernel.execute();
+
+    EXPECT_THAT(dequantizeTensorData(output_tensor),
+                FloatArrayNear(output_data[i], kQuantizedTolerance));
+    EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shapes[i]));
+  }
+}
+
+TEST_F(SubTest, Float) // float Sub with broadcasting and RELU activation
+{
+  Shape base_shape = {2, 3, 1, 2};
+  vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  vector<vector<int32_t>> output_shapes{{2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
+  vector<vector<float>> test_outputs = { // expected results; negative differences appear as 0 (RELU)
+    {0.0f, 2.0f, 0.1f, 1.8f, 0.0f, 1.4f, 0.7f, 0.2f, 1.3f, 0.0f, 0.0f, 0.0f,
+     0.6f, 0.0f, 1.2f, 0.0f, 0.0f, 0.0f, 1.0f, 2.5f, 1.6f, 2.3f, 0.2f, 1.9f,
+     0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.5f, 0.0f, 1.1f, 0.0f, 0.0f, 0.0f},
+    {0.0f, 2.0f, 1.3f, 0.0f, 0.0f, 0.0f, 1.0f, 2.5f, 0.0f, 0.0f, 0.0f, 0.0f},
+    {0.0f, 2.1f, 0.0f, 2.0f, 0.1f, 2.7f, 0.7f, 0.3f, 0.6f, 0.2f, 1.3f, 0.9f,
+     0.6f, 0.0f, 0.5f, 0.0f, 1.2f, 0.0f, 0.7f, 2.3f, 0.2f, 1.8f, 0.3f, 1.9f,
+     0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.2f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+    {0.0f, 2.1f, 0.6f, 0.2f, 1.2f, 0.0f, 0.7f, 2.3f, 0.0f, 0.0f, 0.0f, 0.0f}};
+
+  vector<float> input1_data{-0.3f, 2.3f, 0.9f,  0.5f, 0.8f, -1.1f,
+                            1.2f,  2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
+  vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
+  for (size_t i = 0; i < test_shapes.size(); ++i) // one kernel run per second-operand shape
+  {
+    Tensor input1_tensor =
+      makeInputTensor<DataType::FLOAT32>(base_shape, input1_data, _memory_manager.get());
+    Tensor input2_tensor =
+      makeInputTensor<DataType::FLOAT32>(test_shapes[i], input2_data, _memory_manager.get());
+    Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+    SubParams params{};
+    params.activation = Activation::RELU;
+
+    Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    _memory_manager->allocate_memory(output_tensor); // allocated only after configure()
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
+      << "With shape number " << i;
+
+    EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shapes[i]));
+  }
+}
+
+template <loco::DataType DType> void CheckInteger(luci_interpreter::IMemoryManager *memory_manager)
+{ // Shared body of the integer Sub tests: broadcast subtraction with RELU for element type DType.
+  using dtype = typename loco::DataTypeImpl<DType>::Type;
+  Shape base_shape = {2, 3, 1, 2};
+  std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  std::vector<std::vector<dtype>> test_outputs = { // expected results, one vector per test shape
+    {0, 1, 2, 3, 0, 0, 0, 0, 4,  1, 0, 0, 0, 0, 7,  0, 3, 0,
+     0, 2, 4, 4, 0, 0, 3, 0, 10, 0, 6, 0, 3, 0, 10, 2, 6, 0},
+    {0, 1, 4, 1, 3, 0, 0, 2, 10, 0, 6, 0},
+    {0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 4, 3, 0, 0, 3, 0, 7, 0,
+     2, 4, 0, 2, 0, 0, 8, 0, 6, 0, 1, 0, 8, 2, 6, 0, 1, 0},
+    {0, 0, 0, 0, 7, 0, 2, 4, 6, 0, 1, 0}};
+  std::vector<dtype> input1_data{-1, 2, 1, 0, 4, -5, 1, 3, 7, -1, 7, 1};
+  std::vector<dtype> input2_data{4, 1, -3, -1, 1, 6};
+  for (size_t i = 0; i < test_shapes.size(); ++i)
+  {
+    Tensor input1_tensor = makeInputTensor<DType>(base_shape, input1_data, memory_manager);
+    Tensor input2_tensor = makeInputTensor<DType>(test_shapes[i], input2_data, memory_manager);
+    Tensor output_tensor = makeOutputTensor(DType);
+
+    SubParams params{};
+    params.activation = Activation::RELU;
+
+    Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+    kernel.configure();
+    memory_manager->allocate_memory(output_tensor); // allocated only after configure()
+    kernel.execute();
+
+    EXPECT_THAT(extractTensorData<dtype>(output_tensor), test_outputs[i])
+      << "With shape number " << i;
+  }
+}
+
+TEST_F(SubTest, SInt32) // runs the shared integer scenario for 32-bit signed tensors
+{
+  CheckInteger<loco::DataType::S32>(_memory_manager.get());
+  SUCCEED(); // explicit success marker; the actual expectations live inside CheckInteger
+}
+
+TEST_F(SubTest, SInt64) // runs the shared integer scenario for 64-bit signed tensors
+{
+  CheckInteger<loco::DataType::S64>(_memory_manager.get());
+  SUCCEED(); // explicit success marker; the actual expectations live inside CheckInteger
+}
+
+TEST_F(SubTest, Input_Output_Type_NEG) // negative test: the two inputs have different element types
+{
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {2}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  SubParams params{};
+  params.activation = Activation::RELU;
+
+  Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure()); // FLOAT32 vs S32 must be rejected at configure time
+}
+
+TEST_F(SubTest, Invalid_Output_Type_NEG) // negative test: output type differs from the input type
+{
+  Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S32);
+
+  SubParams params{};
+  params.activation = Activation::RELU;
+
+  Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure()); // S64 inputs with an S32 output must be rejected
+}
+
+TEST_F(SubTest, Invalid_Input_Type_NEG) // negative test: consistent but unsupported element type
+{
+  Tensor input1_tensor = makeInputTensor<DataType::U64>({1}, {1}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::U64>({1}, {2}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U64);
+
+  SubParams params{};
+  params.activation = Activation::RELU;
+
+  Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+  kernel.configure(); // all types agree, so configure() is expected to pass
+  _memory_manager->allocate_memory(output_tensor);
+  EXPECT_ANY_THROW(kernel.execute()); // the U64 type is only rejected when the kernel runs
+}
+
+TEST_F(SubTest, Mismatching_Input_Int_Types_NEG) // negative test: two different integer input types
+{
+  Tensor input1_tensor = makeInputTensor<DataType::S32>({1}, {1}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2}, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S32);
+
+  SubParams params{};
+  params.activation = Activation::NONE;
+
+  Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+  EXPECT_ANY_THROW(kernel.configure()); // S32 vs S64 must be rejected at configure time
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Tanh.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Tanh.cpp
new file mode 100644
index 000000000..c4fa16912
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Tanh.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Tanh.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/tanh.h>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Tanh::Tanh(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {} // one input, one output
+
+void Tanh::configure() // validates types, prepares the U8 lookup table, and sizes the output
+{
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  if (input()->element_type() == DataType::U8)
+  {
+    populateLookupTable(); // the quantized path evaluates tanh through this precomputed table
+  }
+  output()->resize(input()->shape()); // tanh is element-wise, so the output takes the input shape
+}
+
+void Tanh::execute() const
+{
+  // Route to the implementation matching the input element type.
+  // Only FLOAT32 and U8 are handled; any other type is rejected with an exception.
+  const auto type = input()->element_type();
+
+  if (type == DataType::FLOAT32)
+    evalFloat();
+  else if (type == DataType::U8)
+    // Table-driven path; relies on the table filled by populateLookupTable().
+    evalQuantized();
+  else
+    throw std::runtime_error("Unsupported type.");
+}
+
+void Tanh::evalFloat() const // float path: delegates to the TFLite reference Tanh implementation
+{
+  tflite::reference_ops::Tanh(getTensorShape(input()), getTensorData<float>(input()),
+                              getTensorShape(output()), getTensorData<float>(output()));
+}
+
+void Tanh::evalQuantized() const
+{
+  // Quantized tanh is a pure table lookup: every input byte indexes the table
+  // that populateLookupTable() filled in.
+  const int count = tflite::MatchingFlatSize(getTensorShape(input()), getTensorShape(output()));
+  const uint8_t *src = getTensorData<uint8_t>(input());
+  uint8_t *dst = getTensorData<uint8_t>(output());
+  for (int idx = 0; idx < count; ++idx)
+    dst[idx] = getTableValue(src[idx]);
+}
+
+void Tanh::populateLookupTable() // stores the quantized tanh result for every possible uint8 input
+{
+  const auto input_scale = static_cast<double>(input()->scale());
+  const auto input_zero_point = static_cast<int32_t>(input()->zero_point());
+  const auto output_scale = static_cast<double>(output()->scale());
+  const auto output_zero_point = static_cast<int32_t>(output()->zero_point());
+  const float inverse_scale = 1 / output_scale; // computed once so the loop multiplies
+  int32_t maxval = std::numeric_limits<uint8_t>::max();
+  int32_t minval = std::numeric_limits<uint8_t>::min();
+  for (int32_t val = minval; val <= maxval; ++val) // int32_t counter so `val <= 255` can terminate
+  {
+    const float dequantized = input_scale * (val - input_zero_point); // quantized input -> real
+    const float transformed = std::tanh(dequantized);
+    const float rescaled = std::round(transformed * inverse_scale); // real -> output scale units
+    const int32_t quantized = static_cast<int32_t>(rescaled + output_zero_point);
+    setTableValue(static_cast<uint8_t>(std::max(std::min(maxval, quantized), minval)), // clamped
+                  static_cast<uint8_t>(val));
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Tanh.h b/compiler/luci-micro/luci-interpreter/src/kernels/Tanh.h
new file mode 100644
index 000000000..8017c9638
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Tanh.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_TANH_H
+#define LUCI_INTERPRETER_KERNELS_TANH_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Tanh : public Kernel // element-wise tanh kernel with float and uint8 (table-driven) paths
+{
+public:
+  Tanh(const Tensor *input, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+  void populateLookupTable();
+  void setTableValue(uint8_t value, uint8_t idx) { _table[idx] = value; }; // note: value first, index second
+  uint8_t getTableValue(uint8_t idx) const { return _table[idx]; };
+
+private:
+  uint8_t _table[256]{}; // one entry per possible uint8 input value, zero-initialised
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_TANH_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Tanh.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Tanh.test.cpp
new file mode 100644
index 000000000..bfae479a9
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Tanh.test.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Tanh.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class TanhTest : public ::testing::Test // fixture: every Tanh test starts with a fresh memory manager
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager; // passed to the tensor helpers to allocate buffers
+};
+
+TEST_F(TanhTest, Float) // float tanh checked against precomputed reference values
+{
+  Shape input_shape{1, 2, 4, 1};
+  std::vector<float> input_data{
+    0, -6, 2,  4, //
+    3, -2, 10, 1, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Tanh kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor); // allocated only after configure()
+  kernel.execute();
+
+  std::vector<float> ref_output_data{
+    0,          -0.9999877, 0.9640275, 0.999329,  //
+    0.99505475, -0.9640275, 1,         0.7615941, //
+  };
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST_F(TanhTest, Uint8) // quantized tanh: dequantized output must be close to the float reference
+{
+  float kMin = -1;
+  float kMax = 127.f / 128.f;
+  float kTanhTolerance = 2 * (1. / 256);
+  std::pair<float, int32_t> input_quant_param = quantizationParams<uint8_t>(8 * kMin, 8 * kMax); // input range is 8x wider
+  std::pair<float, int32_t> output_quant_param = quantizationParams<uint8_t>(kMin, kMax);
+  std::vector<float> input_data{
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>({2, 6, 4, 1}, input_quant_param.first, input_quant_param.second,
+                                  input_data, _memory_manager.get());
+  Tensor output_tensor =
+    makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
+
+  Tanh kernel(&input_tensor, &output_tensor);
+  kernel.configure(); // for U8 this also builds the lookup table used by execute()
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{
+    0.0,       -0.999987, 0.964027, 0.999329, //
+    -0.999329, -0.96402,  0.99999,  0.76159,  //
+    0.0,       -0.999987, 0.964027, 0.999329, //
+    -0.999329, -0.96402,  0.99999,  0.76159,  //
+    0.0,       -0.999987, 0.964027, 0.999329, //
+    -0.999329, -0.96402,  0.99999,  0.76159,  //
+    0.0,       -0.999987, 0.964027, 0.999329, //
+    -0.999329, -0.96402,  0.99999,  0.76159,  //
+    0.0,       -0.999987, 0.964027, 0.999329, //
+    -0.999329, -0.96402,  0.99999,  0.76159,  //
+    0.0,       -0.999987, 0.964027, 0.999329, //
+    -0.999329, -0.96402,  0.99999,  0.76159,  //
+  };
+  std::vector<int32_t> ref_output_shape{2, 6, 4, 1};
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data, kTanhTolerance));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(TanhTest, InputTypeInvalid_NEG) // negative test: S64 input is not a supported element type
+{
+  std::vector<int64_t> input_data{
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::S64>({2, 6, 4, 1}, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Tanh kernel(&input_tensor, &output_tensor); // configure() is not called in this test
+  _memory_manager->allocate_memory(output_tensor);
+  EXPECT_ANY_THROW(kernel.execute()); // execute() dispatches on the input type and must throw
+}
+
+TEST_F(TanhTest, InputOutputMismatch_NEG) // negative test: FLOAT32 input paired with a U8 output
+{
+  std::vector<float> input_data{
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+    0,  -6, 2, 4, //
+    -4, -2, 8, 1, //
+  };
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 6, 4, 1}, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8);
+
+  Tanh kernel(&input_tensor, &output_tensor);
+  EXPECT_ANY_THROW(kernel.configure()); // configure() checks that both element types are equal
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/TestUtils.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/TestUtils.cpp
new file mode 100644
index 000000000..4d983adda
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/TestUtils.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/TestUtils.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace testing
+{
+
+using ::testing::FloatNear;
+using ::testing::Matcher;
+
+Tensor makeOutputTensor(DataType element_type) { return Tensor(element_type, {}, {}, ""); } // empty shape, no quantization
+
+Tensor makeOutputTensor(DataType element_type, float scale, int32_t zero_point) // quantized overload
+{
+  return Tensor(element_type, {}, {{scale}, {zero_point}}, ""); // single scale / zero-point pair
+}
+
+std::vector<float> dequantizeTensorData(const Tensor &tensor) // U8 / S8 / S16 data -> float values
+{
+  if (tensor.element_type() == DataType::U8)
+  {
+    std::vector<uint8_t> data = extractTensorData<uint8_t>(tensor);
+    return dequantize(data.data(), data.size(), tensor.scale(), tensor.zero_point());
+  }
+  else if (tensor.element_type() == DataType::S8)
+  {
+    std::vector<int8_t> data = extractTensorData<int8_t>(tensor);
+    return dequantize(data.data(), data.size(), tensor.scale(), tensor.zero_point());
+  }
+  else if (tensor.element_type() == DataType::S16)
+  {
+    // S16 quantization is symmetric, so zero point should be zero.
+    for (auto zp : tensor.zero_points())
+    {
+      (void)zp; // keeps release builds (assert compiled out) free of unused-variable warnings
+      assert(zp == 0);
+    }
+
+    std::vector<int16_t> data = extractTensorData<int16_t>(tensor);
+    if (tensor.scales().size() == 1) // a single scale: the whole tensor shares it
+    {
+      return dequantize(data.data(), data.size(), tensor.scale(), 0);
+    }
+
+    // quantize_dimension breaks shape into two parts:
+    // inner dimensions that contains continuous data with one quantization type
+    // outer dimensions that contains other dimensions
+    const Shape &shape = tensor.shape();
+    const int32_t quantized_dimension = tensor.quantized_dimension();
+    assert(quantized_dimension < shape.num_dims());
+    size_t outer_dims_size = 1;
+    int32_t quant_dim_size = shape.dim(quantized_dimension);
+    size_t inner_dims_size = 1;
+    assert(quant_dim_size == tensor.scales().size()); // one scale per slice along that dimension
+
+    for (int i = 0; i < quantized_dimension; ++i)
+      outer_dims_size *= shape.dim(i);
+    for (int i = quantized_dimension + 1; i < shape.num_dims(); ++i)
+      inner_dims_size *= shape.dim(i);
+
+    assert(shape.num_elements() == outer_dims_size * quant_dim_size * inner_dims_size);
+
+    std::vector<float> dequantized_data;
+    dequantized_data.reserve(shape.num_elements());
+    for (size_t outer_it = 0; outer_it < outer_dims_size; ++outer_it)
+      for (int32_t channel = 0; channel < quant_dim_size; ++channel)
+      {
+        float scale = tensor.scales()[channel];
+        size_t offset = inner_dims_size * (quant_dim_size * outer_it + channel); // start of this run
+        std::vector<float> part_dequantized_data =
+          dequantize(data.data() + offset, inner_dims_size, scale, 0);
+        dequantized_data.insert(dequantized_data.end(), part_dequantized_data.begin(),
+                                part_dequantized_data.end());
+      }
+    return dequantized_data;
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+}
+
+Matcher<std::vector<float>> FloatArrayNear(const std::vector<float> &values, float max_abs_error)
+{
+  // One FloatNear matcher per expected element, all sharing the same absolute tolerance.
+  std::vector<Matcher<float>> element_matchers;
+  element_matchers.reserve(values.size());
+  for (size_t idx = 0; idx < values.size(); ++idx)
+    element_matchers.emplace_back(FloatNear(values[idx], max_abs_error));
+
+  return ElementsAreArray(element_matchers);
+}
+
+std::vector<int32_t> extractTensorShape(const Tensor &tensor)
+{
+  // Copy every dimension of the tensor's shape, in order, into a plain vector.
+  const auto &shape = tensor.shape();
+  const int rank = shape.num_dims();
+  std::vector<int32_t> dims;
+  for (int axis = 0; axis < rank; ++axis)
+    dims.push_back(shape.dim(axis));
+  return dims;
+}
+
+} // namespace testing
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/TestUtils.h b/compiler/luci-micro/luci-interpreter/src/kernels/TestUtils.h
new file mode 100644
index 000000000..1f5a0c308
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/TestUtils.h
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_TESTUTILS_H
+#define LUCI_INTERPRETER_KERNELS_TESTUTILS_H
+
+#include "luci_interpreter/core/Tensor.h"
+#include "luci_interpreter/MemoryManager.h"
+
+#include <type_traits>
+
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace testing
+{
+
+template <typename T> // forward declaration; the definition appears later in this header
+std::vector<T> quantize(const float *data, size_t num_elements, float scale, int32_t zero_point);
+
+template <DataType DT> // Tensor of type DT, no quantization parameters, filled from `data`.
+Tensor makeInputTensor(const Shape &shape, const std::vector<typename DataTypeImpl<DT>::Type> &data,
+                       IMemoryManager *memory_manager)
+{
+  Tensor result(DT, shape, {}, "");
+  memory_manager->allocate_memory(result);
+  result.writeData(data.data(), sizeof(typename DataTypeImpl<DT>::Type) * data.size());
+  return result;
+}
+
+/**
+ * @brief Build a layer-wise (per-tensor) quantized input tensor from float values
+ * @tparam DT integer storage type, for example DataType::U8, DataType::S16, DataType::S64
+ * @param shape shape of the tensor to create
+ * @param scale quantization scale shared by all elements
+ * @param zero_point quantization zero point, should be 0 for signed datatypes
+ * @param data float values that get quantized into the tensor
+ * @param memory_manager memory manager that provides the tensor's buffer
+ * @return tensor holding the quantized data
+ */
+template <DataType DT>
+Tensor makeInputTensor(const Shape &shape, float scale, int32_t zero_point,
+                       const std::vector<float> &data, IMemoryManager *memory_manager)
+{
+  using ElemT = typename DataTypeImpl<DT>::Type;
+  // Quantize first; only the integer representation is written into the tensor.
+  const auto encoded = quantize<ElemT>(data.data(), data.size(), scale, zero_point);
+  Tensor result(DT, shape, {{scale}, {zero_point}}, "");
+  memory_manager->allocate_memory(result);
+  result.writeData(encoded.data(), sizeof(ElemT) * encoded.size());
+  return result;
+}
+
+/**
+ * @brief Create channel-wise quantized tensor
+ * @tparam DT base integer data type, for example DataType::U8, DataType::S16, DataType::S64
+ * @param shape desired tensor shape
+ * @param scales scales of quantized number, one per index along quantized_dimension
+ * @param zero_points zero points of quantized number, should be 0 for signed datatypes
+ * @param quantized_dimension dimension to quantize along. Usually channels/output channels
+ * @param data floating point data for quantization, at least shape.num_elements() values
+ * @param memory_manager memory manager for allocating memory to tensor
+ * @return created tensor
+ */
+template <DataType DT>
+Tensor makeInputTensor(const Shape &shape, const std::vector<float> &scales,
+                       const std::vector<int32_t> &zero_points, int quantized_dimension,
+                       const std::vector<float> &data, IMemoryManager *memory_manager)
+{
+  using NativeT = typename DataTypeImpl<DT>::Type;
+  assert(quantized_dimension >= 0 && quantized_dimension < shape.num_dims());
+  Tensor tensor(DT, shape, {scales, zero_points, quantized_dimension}, "");
+
+  // quantized_dimension breaks shape into two parts:
+  // inner dimensions that contains continuous data with one quantization type
+  // outer dimensions that contains other dimensions
+  size_t outer_dims_size = 1;
+  const size_t quant_dim_size = static_cast<size_t>(shape.dim(quantized_dimension));
+  size_t inner_dims_size = 1;
+  assert(quant_dim_size == scales.size());
+  assert(quant_dim_size == zero_points.size());
+
+  for (int i = 0; i < quantized_dimension; ++i)
+    outer_dims_size *= shape.dim(i);
+  for (int i = quantized_dimension + 1; i < shape.num_dims(); ++i)
+    inner_dims_size *= shape.dim(i);
+
+  const size_t num_elements = outer_dims_size * quant_dim_size * inner_dims_size;
+  assert(static_cast<size_t>(shape.num_elements()) == num_elements);
+  assert(data.size() >= num_elements && "data must cover the whole shape");
+
+  std::vector<NativeT> quantized_data;
+  quantized_data.reserve(num_elements);
+  for (size_t outer_it = 0; outer_it < outer_dims_size; ++outer_it)
+    for (size_t channel = 0; channel < quant_dim_size; ++channel)
+    {
+      const size_t offset = inner_dims_size * (quant_dim_size * outer_it + channel);
+      const std::vector<NativeT> part_quantized_data = quantize<NativeT>(
+        data.data() + offset, inner_dims_size, scales[channel], zero_points[channel]);
+      quantized_data.insert(quantized_data.end(), part_quantized_data.begin(),
+                            part_quantized_data.end());
+    }
+  assert(quantized_data.size() == num_elements);
+  memory_manager->allocate_memory(tensor);
+  tensor.writeData(quantized_data.data(), quantized_data.size() * sizeof(NativeT));
+  return tensor;
+}
+
+Tensor makeOutputTensor(DataType element_type); // overload without quantization arguments
+Tensor makeOutputTensor(DataType element_type, float scale, int32_t zero_point);
+
+std::vector<int32_t> extractTensorShape(const Tensor &tensor); // dims as a flat vector
+
+// Returns the DataType tag for the C++ type T, or DataType::Unknown if T has no mapping.
+template <typename T> constexpr DataType getElementType()
+{
+  return std::is_same<T, float>::value
+           ? DataType::FLOAT32
+         : std::is_same<T, double>::value
+           ? DataType::FLOAT64
+         : std::is_same<T, uint8_t>::value
+           ? DataType::U8
+         : std::is_same<T, uint16_t>::value
+           ? DataType::U16
+         : std::is_same<T, uint32_t>::value
+           ? DataType::U32
+         : std::is_same<T, uint64_t>::value
+           ? DataType::U64
+         : std::is_same<T, int8_t>::value
+           ? DataType::S8
+         : std::is_same<T, int16_t>::value
+           ? DataType::S16
+         : std::is_same<T, int32_t>::value
+           ? DataType::S32
+         : std::is_same<T, int64_t>::value
+           ? DataType::S64
+         : std::is_same<T, bool>::value
+           ? DataType::BOOL
+           : DataType::Unknown;
+}
+
+template <typename T> std::vector<T> extractTensorData(const Tensor &tensor)
+{
+  const T *const first = tensor.data<T>(); // start of the tensor's buffer viewed as T
+  return std::vector<T>(first, first + tensor.shape().num_elements());
+}
+
+std::vector<float> dequantizeTensorData(const Tensor &tensor);
+
+// Element-wise `::testing::FloatNear`: one matcher per entry of `values`.
+::testing::Matcher<std::vector<float>> FloatArrayNear(const std::vector<float> &values,
+                                                      float max_abs_error = 1.0e-5f);
+
+// Quantizes `num_elements` floats as round(zero_point + f / scale), saturated to T's range.
+// Signed types use the symmetric range [-max, max]; unsigned types use [0, max].
+template <typename T>
+std::vector<T> quantize(const float *data, size_t num_elements, float scale, int32_t zero_point)
+{
+  static_assert(std::is_integral<T>::value, "Integral type expected.");
+
+  const T t_max = std::numeric_limits<T>::max();
+  const T t_min = std::is_signed<T>::value ? static_cast<T>(-t_max) : static_cast<T>(0);
+  // For 32/64-bit T the float images of the limits are rounded, so they are used only for
+  // comparison; converting them back to T would be out of range.
+  const float q_min = static_cast<float>(t_min);
+  const float q_max = static_cast<float>(t_max);
+
+  std::vector<T> q;
+  q.reserve(num_elements);
+  for (size_t i = 0; i < num_elements; ++i)
+  {
+    const float v = std::round(zero_point + (data[i] / scale));
+    if (!(v < q_max)) // also catches NaN, which saturated to the maximum before as well
+      q.push_back(t_max);
+    else
+      q.push_back(v > q_min ? static_cast<T>(v) : t_min);
+  }
+  return q;
+}
+
+// Maps each quantized value q back to a float as scale * (q - zero_point).
+template <typename T>
+std::vector<float> dequantize(const T *data, size_t num_elements, float scale, int32_t zero_point)
+{
+  static_assert(std::is_integral<T>::value, "Integral type expected.");
+  std::vector<float> real_values;
+  for (const T *it = data; it != data + num_elements; ++it)
+  {
+    real_values.push_back(scale * (*it - zero_point));
+  }
+  return real_values;
+}
+
+// NOTE Returns scale and zero point for _asymmetric_ range (both signed and unsigned).
+template <typename T> std::pair<float, int32_t> quantizationParams(float f_min, float f_max)
+{
+  static_assert(std::is_integral<T>::value, "Integral type expected.");
+  int32_t zero_point = 0;
+  float scale = 0;
+  const T qmin = std::numeric_limits<T>::lowest();
+  const T qmax = std::numeric_limits<T>::max();
+  const float qmin_double = qmin; // NOTE single precision despite the "_double" suffix
+  const float qmax_double = qmax;
+  // 0 should always be a representable value. Let's assume that the initial
+  // min,max range contains 0.
+  assert(f_max >= 0);
+  assert(f_min <= 0);
+  if (f_min == f_max)
+  {
+    // Special case where the min,max range is a point. Should be {0}.
+    assert(f_max == 0);
+    assert(f_min == 0);
+    return {scale, zero_point}; // both are still 0 here
+  }
+
+  // General case.
+  //
+  // First determine the scale.
+  scale = (f_max - f_min) / (qmax_double - qmin_double);
+
+  // Zero-point computation.
+  // First the initial floating-point computation. The zero-point can be
+  // determined from solving an affine equation for any known pair
+  // (real value, corresponding quantized value).
+  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
+  // The arithmetic error on the zero point computed from either pair
+  // will be roughly machine_epsilon * (sum of absolute values of terms)
+  // so we want to use the variant that adds the smaller terms.
+  const float zero_point_from_min = qmin_double - f_min / scale;
+  const float zero_point_from_max = qmax_double - f_max / scale;
+
+  const float zero_point_from_min_error = std::abs(qmin_double) + std::abs(f_min / scale);
+
+  const float zero_point_from_max_error = std::abs(qmax_double) + std::abs(f_max / scale);
+
+  const float zero_point_double = zero_point_from_min_error < zero_point_from_max_error
+                                    ? zero_point_from_min
+                                    : zero_point_from_max;
+
+  // Now we need to nudge the zero point to be an integer
+  // (our zero points are integer, and this is motivated by the requirement
+  // to be able to represent the real value "0" exactly as a quantized value,
+  // which is required in multiple places, for example in Im2col with SAME
+  // padding).
+
+  T nudged_zero_point = 0;
+  if (zero_point_double < qmin_double)
+  {
+    nudged_zero_point = qmin;
+  }
+  else if (zero_point_double > qmax_double)
+  {
+    nudged_zero_point = qmax;
+  }
+  else
+  {
+    nudged_zero_point = static_cast<T>(std::round(zero_point_double));
+  }
+
+  // The zero point should always be in the range of quantized value,
+  // [qmin, qmax].
+  assert(qmax >= nudged_zero_point);
+  assert(qmin <= nudged_zero_point);
+  zero_point = nudged_zero_point;
+  // finally, return the values
+  return {scale, zero_point};
+}
+
+inline float getTolerance(float min, float max, int quantize_steps)
+{
+  return (max - min) / static_cast<float>(quantize_steps); // width of one quantization step
+}
+
+} // namespace testing
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_TESTUTILS_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Transpose.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Transpose.cpp
new file mode 100644
index 000000000..802d87295
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Transpose.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Transpose.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/transpose.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Transpose::Transpose(const Tensor *input, const Tensor *perm, Tensor *output)
+  : Kernel({input, perm}, {output}) // order: inputs {input, perm}, outputs {output}
+{
+}
+
+void Transpose::configure()
+{
+  // Transpose op only supports 1D-4D input arrays.
+  const int dims = input()->shape().num_dims();
+
+  // Validate with LUCI_INTERPRETER_CHECK rather than assert: with NDEBUG an unchecked perm
+  // entry would be used to index the input shape out of bounds in the loop below.
+  LUCI_INTERPRETER_CHECK(dims <= 4);
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+  LUCI_INTERPRETER_CHECK(perm()->shape().num_dims() == 1);
+  LUCI_INTERPRETER_CHECK(perm()->shape().dim(0) == dims);
+
+  const int32_t *perm_data = getTensorData<int32_t>(perm());
+  Shape output_shape(dims);
+  for (int i = 0; i < dims; i++)
+  {
+    LUCI_INTERPRETER_CHECK(perm_data[i] < dims && perm_data[i] >= 0);
+    output_shape.dim(i) = input()->shape().dim(perm_data[i]);
+  }
+  output()->resize(output_shape);
+}
+
+// Runs the TFLite reference transpose for the input's element type.
+void Transpose::execute() const
+{
+  const int32_t *const axes = getTensorData<int32_t>(perm());
+  const int32_t rank = perm()->shape().dim(0);
+
+  tflite::TransposeParams op_params{};
+  op_params.perm_count = rank;
+  for (int axis = 0; axis < rank; ++axis)
+    op_params.perm[axis] = axes[axis];
+
+  // Only FLOAT32 and U8 are dispatched; every other element type is rejected.
+  const DataType type = input()->element_type();
+  if (type == DataType::FLOAT32)
+    tflite::reference_ops::Transpose(op_params, getTensorShape(input()),
+                                     getTensorData<float>(input()), getTensorShape(output()),
+                                     getTensorData<float>(output()));
+  else if (type == DataType::U8)
+    tflite::reference_ops::Transpose(op_params, getTensorShape(input()),
+                                     getTensorData<uint8_t>(input()), getTensorShape(output()),
+                                     getTensorData<uint8_t>(output()));
+  else
+    throw std::runtime_error("Unsupported type.");
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Transpose.h b/compiler/luci-micro/luci-interpreter/src/kernels/Transpose.h
new file mode 100644
index 000000000..d6f89c352
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Transpose.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_TRANSPOSE_H
+#define LUCI_INTERPRETER_KERNELS_TRANSPOSE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Transpose : public Kernel
+{
+public:
+  Transpose(const Tensor *input, const Tensor *perm, Tensor *output);
+
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *perm() const { return _inputs[1]; } // 1-D tensor of input axis indices
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;     // validates operands and resizes the output
+  void execute() const override; // handles FLOAT32 and U8, throws otherwise
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_TRANSPOSE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Transpose.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Transpose.test.cpp
new file mode 100644
index 000000000..43be8f8b9
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Transpose.test.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Transpose.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T> // Runs Transpose on the given data and verifies output shape and values.
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> perm_shape,
+           std::initializer_list<int32_t> output_shape, std::initializer_list<T> input_data,
+           std::initializer_list<int32_t> perm_data, std::initializer_list<T> output_data)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  constexpr DataType element_type = getElementType<T>();
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
+  Tensor perm_tensor = makeInputTensor<DataType::S32>(perm_shape, perm_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(element_type);
+  Transpose kernel(&input_tensor, &perm_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
+}
+
+template <typename T> class TransposeTest : public ::testing::Test
+{ // empty fixture: it only carries the element type T for the typed tests
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>; // types Transpose::execute() handles
+TYPED_TEST_SUITE(TransposeTest, DataTypes);
+
+TYPED_TEST(TransposeTest, Small3D) // perm {2, 0, 1} applied to a 2x3x4 input
+{
+  Check<TypeParam>(/*input_shape=*/{2, 3, 4}, /*perm_shape=*/{3}, /*output_shape=*/{4, 2, 3},
+                   /*input_data=*/{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                                   12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
+                   /*perm_data=*/{2, 0, 1},
+                   /*output_data=*/{0, 4, 8,  12, 16, 20, 1, 5, 9,  13, 17, 21,
+                                    2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23});
+}
+
+TYPED_TEST(TransposeTest, Large4D) // perm {2, 0, 1, 3}: the innermost axis stays in place
+{
+  Check<TypeParam>(
+    /*input_shape=*/{2, 3, 4, 5}, /*perm_shape=*/{4}, /*output_shape=*/{4, 2, 3, 5},
+    /*input_data=*/{0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,
+                    15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
+                    30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
+                    45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
+                    60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,
+                    75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
+                    90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104,
+                    105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119},
+    /*perm_data=*/{2, 0, 1, 3},
+    /*output_data=*/{0,  1,  2,  3,  4,  20, 21, 22, 23, 24, 40,  41,  42,  43,  44,
+                     60, 61, 62, 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104,
+                     5,  6,  7,  8,  9,  25, 26, 27, 28, 29, 45,  46,  47,  48,  49,
+                     65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109,
+                     10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50,  51,  52,  53,  54,
+                     70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114,
+                     15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55,  56,  57,  58,  59,
+                     75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119});
+}
+
+TYPED_TEST(TransposeTest, Large2D) // perm {1, 0}: plain transpose of a 10x12 matrix
+{
+  Check<TypeParam>(
+    /*input_shape=*/{10, 12}, /*perm_shape=*/{2}, /*output_shape=*/{12, 10},
+    /*input_data=*/{0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,
+                    15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
+                    30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
+                    45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
+                    60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,
+                    75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
+                    90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104,
+                    105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119},
+    /*perm_data=*/{1, 0},
+    /*output_data=*/{0,  12, 24, 36,  48,  60, 72, 84, 96,  108, 1,  13, 25, 37,  49,
+                     61, 73, 85, 97,  109, 2,  14, 26, 38,  50,  62, 74, 86, 98,  110,
+                     3,  15, 27, 39,  51,  63, 75, 87, 99,  111, 4,  16, 28, 40,  52,
+                     64, 76, 88, 100, 112, 5,  17, 29, 41,  53,  65, 77, 89, 101, 113,
+                     6,  18, 30, 42,  54,  66, 78, 90, 102, 114, 7,  19, 31, 43,  55,
+                     67, 79, 91, 103, 115, 8,  20, 32, 44,  56,  68, 80, 92, 104, 116,
+                     9,  21, 33, 45,  57,  69, 81, 93, 105, 117, 10, 22, 34, 46,  58,
+                     70, 82, 94, 106, 118, 11, 23, 35, 47,  59,  71, 83, 95, 107, 119});
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/TransposeConv.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/TransposeConv.cpp
new file mode 100644
index 000000000..1b5f9d941
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/TransposeConv.cpp
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/TransposeConv.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/transpose_conv.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+TransposeConv::TransposeConv(const Tensor *output_shape, const Tensor *filter, const Tensor *input,
+                             const Tensor *bias, Tensor *output, Tensor *scratch_tensor,
+                             const TransposeConvParams &params)
+  : KernelWithParams<TransposeConvParams>({output_shape, filter, input, bias},
+                                          {output, scratch_tensor}, params)
+{ // scratch_tensor is registered as the second output; the eval functions fetch it by index 1
+}
+
+TransposeConv::~TransposeConv()
+{
+  // Define destructor here, to delete vector of quantized multipliers properly
+}
+
+// Validates the operands, resizes the output to the requested shape, derives the paddings
+// and prepares the scratch tensor / quantized multipliers for quantized inputs.
+void TransposeConv::configure()
+{
+  const DataType input_type = input()->element_type();
+  const bool is_quantized = input_type == DataType::U8 || input_type == DataType::S16;
+
+  assert(output_shape()->shape().num_dims() == 1);
+  assert(input()->shape().num_dims() == 4);
+  assert(filter()->shape().num_dims() == 4);
+  assert(input_type == DataType::FLOAT32 || is_quantized);
+  assert(input_type == output()->element_type());
+  assert(input()->shape().dim(3) == filter()->shape().dim(3));
+
+  // The output shape is not inferred: it is read from the output_shape tensor.
+  const auto *requested_dims = getTensorData<int32_t>(output_shape());
+  const int rank = output_shape()->shape().dim(0);
+  Shape out_shape(rank);
+  for (int d = 0; d < rank; ++d)
+    out_shape.dim(d) = requested_dims[d];
+  output()->resize(out_shape);
+
+  const int32_t out_h = out_shape.dim(1);
+  const int32_t out_w = out_shape.dim(2);
+  const int32_t kernel_h = filter()->shape().dim(1);
+  const int32_t kernel_w = filter()->shape().dim(2);
+  const int32_t stride_h = params().stride_height;
+  const int32_t stride_w = params().stride_width;
+
+  // These sizes are only intermediates for the padding computation below.
+  const int32_t aux_h = computeOutputSize(params().padding, out_h, kernel_h, stride_h, 1);
+  const int32_t aux_w = computeOutputSize(params().padding, out_w, kernel_w, stride_w, 1);
+  _padding_height = computePadding(stride_h, 1, out_h, kernel_h, aux_h);
+  _padding_width = computePadding(stride_w, 1, out_w, kernel_w, aux_w);
+
+  auto scratch = getOutputTensors()[1];
+  if (!is_quantized)
+  {
+    // evalFloat() does not use the scratch tensor.
+    scratch->set_allocatable(false);
+    return;
+  }
+  scratch->resize(output()->shape());
+  const std::vector<double> real_multipliers =
+    getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+  _quant_multipliers = quantizeMultipliers(real_multipliers);
+}
+
+void TransposeConv::execute() const
+{
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    case DataType::U8:
+      if (filter()->scales().empty())
+        throw std::runtime_error("Filter has no quantization scales."); // was a silent no-op
+      if (filter()->scales().size() == 1)
+        evalQuantized(); // one scale for the whole filter
+      else
+      {
+        LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
+        LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
+                               static_cast<size_t>(filter()->shape().dim(0)));
+        evalQuantizedPerChannel();
+      }
+      break;
+    case DataType::S16:
+      evalQuantizedS16();
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+// Float path: forwards the tensors and padding/stride settings to the TFLite reference kernel.
+void TransposeConv::evalFloat() const
+{
+  tflite::ConvParams conv{};
+  conv.stride_width = params().stride_width;
+  conv.stride_height = params().stride_height;
+  conv.padding_values.width = _padding_width;
+  conv.padding_values.height = _padding_height;
+  conv.padding_type = tflite::PaddingType::kSame;
+
+  tflite::reference_ops::TransposeConv(
+    conv, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
+    getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
+    getTensorShape(output()), getTensorData<float>(output()), tflite::RuntimeShape(), nullptr);
+}
+
+// uint8 path with a single filter scale, delegated to the TFLite reference TransposeConv.
+void TransposeConv::evalQuantized() const
+{
+  auto scratch_tensor = getOutputTensors()[1];
+
+  tflite::ConvParams conv{};
+  conv.padding_type = tflite::PaddingType::kSame;
+  conv.padding_values.width = _padding_width;
+  conv.padding_values.height = _padding_height;
+  conv.stride_width = params().stride_width;
+  conv.stride_height = params().stride_height;
+  // The kernel expects input and filter zero points to be negated.
+  conv.input_offset = -input()->zero_point();
+  conv.weights_offset = -filter()->zero_point();
+  conv.output_offset = output()->zero_point();
+  const auto &requant = _quant_multipliers[0];
+  conv.output_multiplier = requant.multiplier;
+  conv.output_shift = requant.shift;
+  conv.quantized_activation_min = std::numeric_limits<uint8_t>::min();
+  conv.quantized_activation_max = std::numeric_limits<uint8_t>::max();
+
+  tflite::reference_ops::TransposeConv(
+    conv, getTensorShape(input()), getTensorData<uint8>(input()), getTensorShape(filter()),
+    getTensorData<uint8>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
+    getTensorShape(output()), getTensorData<uint8>(output()), tflite::RuntimeShape(), nullptr,
+    getTensorData<int32_t>(scratch_tensor));
+}
+
+void TransposeConv::evalQuantizedPerChannel() const // uint8, per-output-channel filter scales
+{
+  const auto *input_data = getTensorData<uint8_t>(input());
+  const auto *filter_data = getTensorData<uint8_t>(filter());
+  const auto *bias_data = getTensorData<int32_t>(bias());
+  auto *output_data = getTensorData<uint8_t>(output());
+
+  auto scratch_tensor = getOutputTensors()[1]; // int32 accumulators, one per output element
+  auto *scratch_data = getTensorData<int32_t>(scratch_tensor);
+
+  const Shape &input_shape = input()->shape();
+  const Shape &filter_shape = filter()->shape();
+  const Shape &output_shape = output()->shape();
+
+  const int32_t batches = input_shape.dim(0);
+  const int32_t input_height = input_shape.dim(1);
+  const int32_t input_width = input_shape.dim(2);
+  const int32_t input_depth = input_shape.dim(3);
+  const int32_t output_depth = filter_shape.dim(0);
+  const int32_t filter_height = filter_shape.dim(1);
+  const int32_t filter_width = filter_shape.dim(2);
+  const int32_t output_height = output_shape.dim(1);
+  const int32_t output_width = output_shape.dim(2);
+
+  const int32_t stride_height = _params.stride_height;
+  const int32_t stride_width = _params.stride_width;
+
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(Activation::NONE, output(), &activation_min, &activation_max);
+
+  std::memset(scratch_data, 0, scratch_tensor->shape().num_elements() * sizeof(int32_t));
+
+  BroadcastableWrapper<ChannelQuantMultipliers> output_multipliers(_quant_multipliers);
+  for (int32_t batch = 0; batch < batches; ++batch)
+  {
+    for (int32_t in_y = 0; in_y < input_height; ++in_y) // scatter each input pixel
+    {
+      for (int32_t in_x = 0; in_x < input_width; ++in_x)
+      {
+        for (int32_t in_c = 0; in_c < input_depth; ++in_c)
+        {
+          const int32_t out_y_origin = in_y * stride_height - _padding_height;
+          const int32_t out_x_origin = in_x * stride_width - _padding_width;
+          for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
+          {
+            for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
+            {
+              const int32_t out_x = out_x_origin + filter_x;
+              const int32_t out_y = out_y_origin + filter_y;
+              if ((out_y >= 0 && out_y < output_height) && (out_x >= 0 && out_x < output_width))
+              {
+                for (int32_t out_c = 0; out_c < output_depth; ++out_c)
+                {
+                  const uint8_t input_val =
+                    input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
+                  const uint8_t filter_val =
+                    filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
+                  scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] +=
+                    static_cast<int32_t>(input_val - input()->zero_point()) *
+                    static_cast<int32_t>(filter_val - filter()->zero_points()[out_c]);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    for (int32_t out_y = 0; out_y < output_height; ++out_y) // requantize this batch
+    {
+      for (int32_t out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int32_t out_c = 0; out_c < output_depth; ++out_c)
+        {
+          int32_t acc = scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)];
+          if (bias_data)
+          {
+            acc += bias_data[out_c];
+          }
+
+          int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
+            acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift);
+
+          scaled_acc += output()->zero_point();
+          scaled_acc = std::max(scaled_acc, activation_min);
+          scaled_acc = std::min(scaled_acc, activation_max);
+
+          output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
+        }
+      }
+    }
+  }
+}
+
+void TransposeConv::evalQuantizedS16() const // int16 data, int64 accumulation
+{
+  const auto *input_data = getTensorData<int16_t>(input());
+  const auto *filter_data = getTensorData<int16_t>(filter());
+  const auto *bias_data = getTensorData<int64_t>(bias()); // null-checked before use below
+  auto *output_data = getTensorData<int16_t>(output());
+
+  auto scratch_tensor = getOutputTensors()[1]; // second output of the kernel
+  auto *scratch_data = getTensorData<int64_t>(scratch_tensor);
+
+  const Shape &input_shape = input()->shape();
+  const Shape &filter_shape = filter()->shape();
+  const Shape &output_shape = output()->shape();
+
+  const int32_t batches = input_shape.dim(0); // input dims read as [batch, height, width, depth]
+  const int32_t input_height = input_shape.dim(1);
+  const int32_t input_width = input_shape.dim(2);
+  const int32_t input_depth = input_shape.dim(3);
+  const int32_t output_depth = filter_shape.dim(0); // filter is indexed [out_c, y, x, in_c]
+  const int32_t filter_height = filter_shape.dim(1);
+  const int32_t filter_width = filter_shape.dim(2);
+  const int32_t output_height = output_shape.dim(1);
+  const int32_t output_width = output_shape.dim(2);
+
+  const int32_t stride_height = _params.stride_height;
+  const int32_t stride_width = _params.stride_width;
+
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(Activation::NONE, output(), &activation_min, &activation_max);
+
+  std::memset(scratch_data, 0, scratch_tensor->shape().num_elements() * sizeof(int64_t));
+
+  BroadcastableWrapper<ChannelQuantMultipliers> output_multipliers(_quant_multipliers);
+  for (int32_t batch = 0; batch < batches; ++batch) // each batch: accumulate, then requantize
+  {
+    for (int32_t in_y = 0; in_y < input_height; ++in_y)
+    {
+      for (int32_t in_x = 0; in_x < input_width; ++in_x)
+      {
+        for (int32_t in_c = 0; in_c < input_depth; ++in_c)
+        {
+          const int32_t out_y_origin = in_y * stride_height - _padding_height;
+          const int32_t out_x_origin = in_x * stride_width - _padding_width;
+          for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) // scatter over taps
+          {
+            for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
+            {
+              const int32_t out_x = out_x_origin + filter_x;
+              const int32_t out_y = out_y_origin + filter_y;
+              if ((out_y >= 0 && out_y < output_height) && (out_x >= 0 && out_x < output_width))
+              {
+                for (int32_t out_c = 0; out_c < output_depth; ++out_c)
+                {
+                  const int16_t input_val = // used raw: no zero point is subtracted
+                    input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
+                  const int16_t filter_val =
+                    filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
+                  scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] +=
+                    static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    for (int32_t out_y = 0; out_y < output_height; ++out_y) // requantize this batch
+    {
+      for (int32_t out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int32_t out_c = 0; out_c < output_depth; ++out_c)
+        {
+          int64_t acc = scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)];
+          if (bias_data)
+          {
+            acc += bias_data[out_c]; // one bias value per output channel
+          }
+          int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier( // uses out_c's entry
+            acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift);
+
+          scaled_acc = std::max(scaled_acc, activation_min); // clamp to the activation range
+          scaled_acc = std::min(scaled_acc, activation_max);
+
+          output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
+        }
+      }
+    }
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/TransposeConv.h b/compiler/luci-micro/luci-interpreter/src/kernels/TransposeConv.h
new file mode 100644
index 000000000..cea0cf3c7
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/TransposeConv.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_TRANSPOSECONV_H
+#define LUCI_INTERPRETER_KERNELS_TRANSPOSECONV_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class ChannelQuantMultipliers;
+
+class TransposeConv : public KernelWithParams<TransposeConvParams>
+{
+public:
+  TransposeConv(const Tensor *output_shape, const Tensor *filter, const Tensor *input,
+                const Tensor *bias, Tensor *output, Tensor *scratch_tensor,
+                const TransposeConvParams &params);
+
+  ~TransposeConv(); // NOTE(review): likely out of line due to the forward-declared member type
+
+  const Tensor *output_shape() const { return _inputs[0]; } // input slot 0
+  const Tensor *filter() const { return _inputs[1]; }       // input slot 1
+  const Tensor *input() const { return _inputs[2]; }        // input slot 2
+  const Tensor *bias() const { return _inputs[3]; }         // input slot 3; bias is optional
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+  void evalQuantized() const;
+  void evalQuantizedPerChannel() const;
+  void evalQuantizedS16() const;
+
+private:
+  int32_t _padding_height{}; // subtracted from the output row origin in the eval loops
+  int32_t _padding_width{};  // subtracted from the output column origin in the eval loops
+  // The input-to-output scaling factor (the 'real multiplier') is stored as a fixed-point
+  // multiplier plus a shift; the quantized kernels look entries up by output channel.
+  std::vector<ChannelQuantMultipliers> _quant_multipliers;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_TRANSPOSECONV_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/TransposeConv.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/TransposeConv.test.cpp
new file mode 100644
index 000000000..4856e1b87
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/TransposeConv.test.cpp
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/TransposeConv.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T, typename B> // T: data element type, B: bias element type
+void Check(std::initializer_list<int32_t> output_shape_shape,
+           std::initializer_list<int32_t> weight_shape, std::initializer_list<int32_t> input_shape,
+           std::initializer_list<int32_t> bias_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<int32_t> output_shape_data, std::initializer_list<T> weight_data,
+           std::initializer_list<T> input_data, std::initializer_list<B> bias_data,
+           std::initializer_list<T> output_data, luci::Padding padding, int32_t stride_height,
+           int32_t stride_width)
+{ // NOTE: output_shape is never read below; only output_data is verified
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  constexpr DataType element_type = getElementType<T>();
+  Tensor output_shape_tensor =
+    makeInputTensor<DataType::S32>(output_shape_shape, output_shape_data, memory_manager.get());
+  Tensor weight_tensor =
+    makeInputTensor<element_type>(weight_shape, weight_data, memory_manager.get());
+  Tensor input_data_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
+
+  DataType scratch_data_type = element_type == DataType::S16 ? DataType::S64 : DataType::S32;
+  Tensor scratch_tensor(scratch_data_type, Shape({}), {}, ""); // memory allocated after configure()
+  Tensor output_tensor = makeOutputTensor(element_type);
+
+  TransposeConvParams params{};
+  params.padding = padding;
+  params.stride_height = stride_height;
+  params.stride_width = stride_width;
+
+  if (bias_data.size() != 0) // an empty bias_data selects the bias-less kernel below
+  {
+    Tensor bias_tensor =
+      makeInputTensor<getElementType<B>()>(bias_shape, bias_data, memory_manager.get());
+    TransposeConv kernel(&output_shape_tensor, &weight_tensor, &input_data_tensor, &bias_tensor,
+                         &output_tensor, &scratch_tensor, params);
+    kernel.configure();
+    memory_manager->allocate_memory(output_tensor);
+    memory_manager->allocate_memory(scratch_tensor);
+    kernel.execute();
+  }
+  else
+  {
+    TransposeConv kernel(&output_shape_tensor, &weight_tensor, &input_data_tensor, nullptr,
+                         &output_tensor, &scratch_tensor, params);
+    kernel.configure();
+    memory_manager->allocate_memory(output_tensor);
+    memory_manager->allocate_memory(scratch_tensor);
+    kernel.execute();
+  }
+  EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
+}
+
+TEST(TransposeConvTest, FloatSimple) // 3x3 filter, 4x4 input, SAME padding, stride 1, no bias
+{
+  Check<float, float>(
+    /*output_shape_shape=*/{4}, /*weight_shape=*/{1, 3, 3, 1}, /*input_shape=*/{1, 4, 4, 1},
+    /*bias_shape=*/{}, /*output_shape=*/{1, 4, 4, 1}, /*output_shape_data=*/{1, 4, 4, 1},
+    /*weight_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9},
+    /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+    /*bias_data=*/{},
+    /*output_data=*/{29, 62, 83, 75, 99, 192, 237, 198, 207, 372, 417, 330, 263, 446, 485, 365},
+    /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1);
+
+  SUCCEED();
+}
+
+TEST(TransposeConvTest, FloatTwoFiltersTest) // weights {1, 3, 3, 2}: two input channels, one output
+{
+  Check<float, float>(
+    /*output_shape_shape=*/{4}, /*weight_shape=*/{1, 3, 3, 2}, /*input_shape=*/{1, 4, 4, 2},
+    /*bias_shape=*/{}, /*output_shape=*/{1, 4, 4, 1}, /*output_shape_data=*/{1, 4, 4, 1},
+    /*weight_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
+    /*input_data=*/{1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+                    17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
+    /*bias_data=*/{},
+    /*output_data=*/
+    {184, 412, 568, 528, 678, 1347, 1689, 1434, 1494, 2715, 3057, 2442, 1968, 3352, 3652, 2760},
+    /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1);
+
+  SUCCEED();
+}
+
+TEST(TransposeConvTest, SimpleBiasTest) // VALID padding, stride 2, two output channels plus bias
+{
+  Check<float, float>(
+    /*output_shape_shape=*/{4}, /*weight_shape=*/{2, 3, 3, 1},
+    /*input_shape=*/{1, 2, 2, 1},
+    /*bias_shape=*/{2}, /*output_shape=*/{1, 5, 5, 2}, /*output_shape_data=*/{1, 5, 5, 2},
+    /*weight_data=*/{1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18},
+    /*input_data=*/{1, 2, 3, 4},
+    /*bias_data=*/{3, 4},
+    /*output_data=*/{4,  6,  6,  8,  10, 14, 9,  12, 13, 16, 10,  12,  12, 14, 28, 32, 21,
+                     24, 25, 28, 19, 24, 27, 32, 65, 76, 45, 52,  57,  64, 24, 28, 30, 34,
+                     64, 72, 39, 44, 47, 52, 42, 46, 48, 52, 106, 114, 63, 68, 71, 76},
+    /*params.padding=*/luci::Padding::VALID, /*stride_height=*/2, /*stride_width=*/2);
+
+  SUCCEED();
+}
+
+TEST(TransposeConvTest, UInt8) // per-tensor quantized U8, VALID padding, stride 2
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  std::vector<float> input_data{1, 2, 3, 4};
+  std::vector<float> filter_data{1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18};
+  std::vector<float> bias_data{3, 4};
+  std::vector<int32_t> output_shape_data{1, 5, 5, 2};
+  std::vector<float> ref_output_data{
+    4,  6,  6,  8,  10,  14,  9,  12, 13, 16, //
+    10, 12, 12, 14, 28,  32,  21, 24, 25, 28, //
+    19, 24, 27, 32, 65,  76,  45, 52, 57, 64, //
+    24, 28, 30, 34, 64,  72,  39, 44, 47, 52, //
+    42, 46, 48, 52, 106, 114, 63, 68, 71, 76, //
+  };
+
+  // Each range spans 255 steps of the scale noted on its line, so every scale is a power of two.
+  auto input_quant = quantizationParams<uint8_t>(-8.0, 7.9375);  // s = 1 / 16, zp = 128
+  auto filter_quant = quantizationParams<uint8_t>(-24.0, 39.75); // s = 1 / 4, zp = 96
+  auto output_quant = quantizationParams<uint8_t>(-64.0, 191.0); // s = 1, zp = 64
+
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 2, 1}, input_quant.first, input_quant.second, input_data, memory_manager.get());
+  Tensor filter_tensor = makeInputTensor<DataType::U8>(
+    {2, 3, 3, 1}, filter_quant.first, filter_quant.second, filter_data, memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S32>({2}, input_quant.first * filter_quant.first,
+                                                      0, bias_data, memory_manager.get());
+  Tensor output_shape_tensor =
+    makeInputTensor<DataType::S32>({4}, output_shape_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, output_quant.first, output_quant.second);
+
+  DataType scratch_data_type =
+    input_tensor.element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
+  Tensor scratch_tensor(scratch_data_type, Shape({}), {}, "");
+
+  TransposeConvParams params{};
+  params.padding = Padding::VALID;
+  params.stride_height = 2;
+  params.stride_width = 2;
+
+  TransposeConv kernel(&output_shape_tensor, &filter_tensor, &input_tensor, &bias_tensor,
+                       &output_tensor, &scratch_tensor, params);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  memory_manager->allocate_memory(scratch_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape_data));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(TransposeConvTest, UInt8_CWQ) // U8 with one filter scale/zero point per output channel
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  const int32_t output_channels = 2;
+  std::vector<float> input_data{1, 2, 3, 4};
+  std::vector<float> filter_data{1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18};
+  std::vector<float> bias_data{3, 4};
+  std::vector<int32_t> output_shape_data{1, 5, 5, 2};
+  std::vector<float> ref_output_data{
+    4,  6,  6,  8,  10,  14,  9,  12, 13, 16, //
+    10, 12, 12, 14, 28,  32,  21, 24, 25, 28, //
+    19, 24, 27, 32, 65,  76,  45, 52, 57, 64, //
+    24, 28, 30, 34, 64,  72,  39, 44, 47, 52, //
+    42, 46, 48, 52, 106, 114, 63, 68, 71, 76, //
+  };
+
+  // Input/output ranges span 255 steps of the scale noted on each line.
+  auto input_quant = quantizationParams<uint8_t>(-8.0, 7.9375);  // s = 1 / 16, zp = 128
+  auto output_quant = quantizationParams<uint8_t>(-64.0, 191.0); // s = 1, zp = 64
+
+  std::vector<std::pair<float, int32_t>> filter_quant_params;
+  filter_quant_params.push_back(quantizationParams<uint8_t>(0, 17)); // output channel 0
+  filter_quant_params.push_back(quantizationParams<uint8_t>(0, 18)); // output channel 1
+
+  std::vector<float> filter_scales;
+  std::vector<int32_t> filter_zerops;
+  for (auto iter : filter_quant_params)
+  {
+    filter_scales.push_back(iter.first);
+    filter_zerops.push_back(iter.second);
+  }
+
+  std::vector<float> bias_scales;
+  for (int i = 0; i < output_channels; ++i)
+    bias_scales.push_back(filter_quant_params[i].first * input_quant.first);
+  std::vector<int32_t> zerop(output_channels, 0);
+
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 2, 1}, input_quant.first, input_quant.second, input_data, memory_manager.get());
+  Tensor filter_tensor = makeInputTensor<DataType::U8>(
+    {output_channels, 3, 3, 1}, filter_scales, filter_zerops, 0, filter_data, memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S32>({output_channels}, bias_scales, zerop, 0,
+                                                      bias_data, memory_manager.get());
+  Tensor output_shape_tensor =
+    makeInputTensor<DataType::S32>({4}, output_shape_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U8, output_quant.first, output_quant.second);
+
+  DataType scratch_data_type =
+    input_tensor.element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
+  Tensor scratch_tensor(scratch_data_type, Shape({}), {}, "");
+
+  TransposeConvParams params{};
+  params.padding = Padding::VALID;
+  params.stride_height = 2;
+  params.stride_width = 2;
+
+  TransposeConv kernel(&output_shape_tensor, &filter_tensor, &input_tensor, &bias_tensor,
+                       &output_tensor, &scratch_tensor, params);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  memory_manager->allocate_memory(scratch_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape_data));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(TransposeConvTest, SInt16) // S16 data with an S64 bias; every zero point is 0
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  std::vector<float> input_data{1, 2, 3, 4};
+  std::vector<float> filter_data{1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18};
+  std::vector<float> bias_data{3, 4};
+  std::vector<int32_t> output_shape_data{1, 5, 5, 2};
+  std::vector<float> ref_output_data{
+    4,  6,  6,  8,  10,  14,  9,  12, 13, 16, //
+    10, 12, 12, 14, 28,  32,  21, 24, 25, 28, //
+    19, 24, 27, 32, 65,  76,  45, 52, 57, 64, //
+    24, 28, 30, 34, 64,  72,  39, 44, 47, 52, //
+    42, 46, 48, 52, 106, 114, 63, 68, 71, 76, //
+  };
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 2, 2, 1}, 0.25, 0, input_data, memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::S16>({2, 3, 3, 1}, 0.2, 0, filter_data, memory_manager.get());
+  Tensor bias_tensor = // bias scale is the product of the input and filter scales
+    makeInputTensor<DataType::S64>({2}, 0.25 * 0.2, 0, bias_data, memory_manager.get());
+  Tensor output_shape_tensor =
+    makeInputTensor<DataType::S32>({4}, output_shape_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S16, 0.5, 0);
+
+  DataType scratch_data_type =
+    input_tensor.element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
+  Tensor scratch_tensor(scratch_data_type, Shape({}), {}, "");
+
+  TransposeConvParams params{};
+  params.padding = Padding::VALID;
+  params.stride_height = 2;
+  params.stride_width = 2;
+
+  TransposeConv kernel(&output_shape_tensor, &filter_tensor, &input_tensor, &bias_tensor,
+                       &output_tensor, &scratch_tensor, params);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  memory_manager->allocate_memory(scratch_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape_data));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(TransposeConvTest, SInt16_CWQ_weights) // S16 with a separate filter scale per output channel
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  const int output_channels = 2;
+  const Shape input_shape{1, 2, 2, 1};
+  const Shape filter_shape{output_channels, 3, 3, 1};
+  const Shape bias_shape{output_channels};
+  std::vector<int32_t> output_shape_data{1, 5, 5, output_channels};
+
+  std::vector<float> input_data{1, 2, 3, 4};
+  std::vector<float> filter_data{1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18};
+  std::vector<float> bias_data{3, 4};
+
+  std::vector<float> ref_output_data{
+    4,  6,  6,  8,  10,  14,  9,  12, 13, 16, //
+    10, 12, 12, 14, 28,  32,  21, 24, 25, 28, //
+    19, 24, 27, 32, 65,  76,  45, 52, 57, 64, //
+    24, 28, 30, 34, 64,  72,  39, 44, 47, 52, //
+    42, 46, 48, 52, 106, 114, 63, 68, 71, 76, //
+  };
+
+  const float input_scale = 0.25;
+  const float output_scale = 0.5;
+  const std::vector<float> filter_scales{0.2f, 0.5f}; // one scale per output channel
+  std::vector<float> bias_scales{filter_scales[0] * input_scale, filter_scales[1] * input_scale};
+  const std::vector<int32_t> zerop(2, 0); // shared zero points for the filter and bias tensors
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>(input_shape, input_scale, 0, input_data, memory_manager.get());
+  Tensor filter_tensor = makeInputTensor<DataType::S16>(filter_shape, filter_scales, zerop, 0,
+                                                        filter_data, memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, bias_scales, zerop, 0, bias_data,
+                                                      memory_manager.get());
+  Tensor output_shape_tensor =
+    makeInputTensor<DataType::S32>({4}, output_shape_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S16, output_scale, 0);
+
+  DataType scratch_data_type =
+    input_tensor.element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
+  Tensor scratch_tensor(scratch_data_type, Shape({}), {}, "");
+
+  TransposeConvParams params{};
+  params.padding = Padding::VALID;
+  params.stride_height = 2;
+  params.stride_width = 2;
+
+  TransposeConv kernel(&output_shape_tensor, &filter_tensor, &input_tensor, &bias_tensor,
+                       &output_tensor, &scratch_tensor, params);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  memory_manager->allocate_memory(scratch_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape_data));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Unpack.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Unpack.cpp
new file mode 100644
index 000000000..9127241c0
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Unpack.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Unpack.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Unpack::Unpack(const Tensor *input, std::vector<Tensor *> outputs, const UnpackParams &params)
+  : KernelWithParams<UnpackParams>({input}, std::move(outputs), params) // one input, N outputs
+{
+}
+
+void Unpack::configure() // sizes every output as the input shape with `axis` removed
+{
+  const Shape &input_shape = input()->shape();
+
+  // Checked at runtime: assert() is a no-op under NDEBUG and a bad axis overruns output_shape.
+  const int axis = _params.axis < 0 ? _params.axis + input_shape.num_dims() : _params.axis;
+  if (axis < 0 || axis >= input_shape.num_dims())
+    throw std::runtime_error("Unpack: axis is out of range.");
+
+  Shape output_shape(input_shape.num_dims() - 1);
+  int out_index = 0;
+  for (int in_index = 0; in_index < input_shape.num_dims(); ++in_index)
+  {
+    if (in_index != axis)
+      output_shape.dim(out_index++) = input_shape.dim(in_index);
+  }
+
+  for (Tensor *output : _outputs)
+  {
+    if (output->element_type() != input()->element_type())
+      throw std::runtime_error("Unpack: output type must match input type.");
+    output->resize(output_shape);
+  }
+}
+
+template <typename T> void Unpack::executeImpl() const // T is the tensor element type
+{
+  tflite::UnpackParams params{};
+  params.axis = _params.axis;         // forwarded as given (not normalized here)
+  params.num_split = _outputs.size(); // one split per output tensor
+  VectorOfTensors<T, false> all_outputs(_outputs); // supplies the output shapes and data pointers
+  tflite::reference_ops::Unpack<T>(params, getTensorShape(input()), getTensorData<T>(input()),
+                                   **all_outputs.shapes(), all_outputs.data());
+}
+
+void Unpack::execute() const // dispatches executeImpl on the input element type
+{
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      return executeImpl<float>();
+    case DataType::U8:
+      return executeImpl<uint8_t>();
+    default: // only FLOAT32 and U8 are handled
+      throw std::runtime_error("Unsupported type.");
+  }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Unpack.h b/compiler/luci-micro/luci-interpreter/src/kernels/Unpack.h
new file mode 100644
index 000000000..f4a44ecad
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Unpack.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_UNPACK_H
+#define LUCI_INTERPRETER_KERNELS_UNPACK_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Unpack : public KernelWithParams<UnpackParams> // kernel with one input and many outputs
+{
+public:
+  Unpack(const Tensor *input, std::vector<Tensor *> outputs, const UnpackParams &params);
+
+  const Tensor *input() const { return _inputs[0]; }
+  Tensor *output(int index) const { return _outputs[index]; } // no bounds check on index
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  template <typename T> void executeImpl() const; // typed worker called from execute()
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_UNPACK_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Unpack.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Unpack.test.cpp
new file mode 100644
index 000000000..9384ddc83
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Unpack.test.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Unpack.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(int axis, Shape input_shape, std::initializer_list<T> input_data,
+           const std::vector<std::initializer_list<int32_t>> &exp_output_shape,
+           std::vector<std::initializer_list<T>> exp_output_data)
+{
+  // Unpacks input_data along `axis` and verifies the shape and contents of every output.
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  constexpr DataType element_type = getElementType<T>();
+  const int num_outputs = input_shape.dim(axis < 0 ? axis + input_shape.num_dims() : axis);
+
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
+  std::vector<Tensor> output_tensors;
+  output_tensors.reserve(num_outputs);
+  for (int i = 0; i < num_outputs; ++i)
+    output_tensors.push_back(makeOutputTensor(element_type));
+
+  std::vector<Tensor *> output_tensor_ptrs(num_outputs);
+  for (int i = 0; i < num_outputs; ++i)
+    output_tensor_ptrs[i] = &output_tensors[i];
+
+  UnpackParams params{};
+  params.axis = axis;
+
+  Unpack kernel(&input_tensor, std::move(output_tensor_ptrs), params);
+  kernel.configure();
+  for (int i = 0; i < num_outputs; i++)
+  {
+    memory_manager->allocate_memory(output_tensors[i]);
+  }
+  kernel.execute();
+
+  for (int i = 0; i < num_outputs; ++i)
+  {
+    // Check the inferred shape as well as the data.
+    EXPECT_THAT(extractTensorShape(output_tensors[i]),
+                ::testing::ElementsAreArray(exp_output_shape[i]));
+    EXPECT_THAT(extractTensorData<T>(output_tensors[i]),
+                ::testing::ElementsAreArray(exp_output_data[i]));
+  }
+}
+
+template <typename T> class UnpackTest : public ::testing::Test // empty typed-test fixture
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_SUITE(UnpackTest, DataTypes);
+
+TYPED_TEST(UnpackTest, ThreeOutputs) // axis 0 of a {3, 2} input: three outputs of shape {2}
+{
+  Check<TypeParam>(/*axis=*/0, /*input_shape=*/{3, 2},
+                   /*input_data=*/{1, 2, 3, 4, 5, 6},
+                   /*exp_output_shape=*/{{2}, {2}, {2}},
+                   /*exp_output_data=*/{{1, 2}, {3, 4}, {5, 6}});
+}
+
+TYPED_TEST(UnpackTest, ThreeOutputsAxisOne) // NOTE: despite the name, axis 1 yields two outputs
+{
+  Check<TypeParam>(/*axis=*/1, /*input_shape=*/{3, 2},
+                   /*input_data=*/{1, 2, 3, 4, 5, 6},
+                   /*exp_output_shape=*/{{3}, {3}},
+                   /*exp_output_data=*/{{1, 3, 5}, {2, 4, 6}});
+}
+
+TYPED_TEST(UnpackTest, ThreeOutputsNegativeAxisOne) // axis -1 is axis 1 here: two outputs
+{
+  Check<TypeParam>(/*axis=*/-1, /*input_shape=*/{3, 2},
+                   /*input_data=*/{1, 2, 3, 4, 5, 6},
+                   /*exp_output_shape=*/{{3}, {3}},
+                   /*exp_output_data=*/{{1, 3, 5}, {2, 4, 6}});
+}
+
+TYPED_TEST(UnpackTest, ThreeOutputsNegativeAxisTwo) // axis -2 is axis 0 here: three outputs
+{
+  Check<TypeParam>(/*axis=*/-2, /*input_shape=*/{3, 2},
+                   /*input_data=*/{1, 2, 3, 4, 5, 6},
+                   /*exp_output_shape=*/{{2}, {2}, {2}},
+                   /*exp_output_data=*/{{1, 2}, {3, 4}, {5, 6}});
+}
+
+TYPED_TEST(UnpackTest, OneOutput) // a size-1 axis gives a single output holding all the data
+{
+  Check<TypeParam>(/*axis=*/0, /*input_shape=*/{1, 6},
+                   /*input_data=*/{1, 2, 3, 4, 5, 6},
+                   /*exp_output_shape=*/{{6}},
+                   /*exp_output_data=*/{{1, 2, 3, 4, 5, 6}});
+}
+
+TYPED_TEST(UnpackTest, ThreeDimensionsTwoOutputs) // splits the last axis of a {2, 2, 2} input
+{
+  Check<TypeParam>(/*axis=*/2, /*input_shape=*/{2, 2, 2},
+                   /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8},
+                   /*exp_output_shape=*/{{2, 2}, {2, 2}},
+                   /*exp_output_data=*/{{1, 3, 5, 7}, {2, 4, 6, 8}});
+}
+
+TYPED_TEST(UnpackTest, FiveDimensionsTwoOutputs) // rank-5 input split along axis 2
+{
+  Check<TypeParam>(
+    /*axis=*/2, /*input_shape=*/{2, 2, 2, 2, 1},
+    /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+    /*exp_output_shape=*/{{2, 2, 2, 1}, {2, 2, 2, 1}},
+    /*exp_output_data=*/
+    {{1, 2, 5, 6, 9, 10, 13, 14}, {3, 4, 7, 8, 11, 12, 15, 16}});
+}
+
+TYPED_TEST(UnpackTest, VectorToScalar) // rank-1 input: five outputs with an empty shape
+{
+  Check<TypeParam>(/*axis=*/0, /*input_shape=*/{5},
+                   /*input_data=*/{1, 2, 3, 4, 5},
+                   /*exp_output_shape=*/{{}, {}, {}, {}, {}},
+                   /*exp_output_data=*/{{1}, {2}, {3}, {4}, {5}});
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Utils.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/Utils.cpp
new file mode 100644
index 000000000..5d8e5db83
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Utils.cpp
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Utils.h"
+
+#include <cassert>
+#include <cmath>
+#include <limits>
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+/**
+ * Writes the clamping interval that implements a fused activation for element type T.
+ *
+ * NONE uses the full range of T, RELU clamps at 0 from below, RELU_N1_TO_1 clamps to
+ * [-1, 1] and RELU6 clamps to [0, 6].
+ *
+ * @param activation     fused activation to translate
+ * @param activation_min receives the lower bound
+ * @param activation_max receives the upper bound
+ * @throws std::runtime_error for any other activation value
+ */
+template <typename T>
+void calculateActivationRange(Activation activation, T *activation_min, T *activation_max)
+{
+  using Limits = std::numeric_limits<T>;
+
+  if (activation == Activation::NONE)
+  {
+    *activation_min = Limits::lowest();
+    *activation_max = Limits::max();
+  }
+  else if (activation == Activation::RELU)
+  {
+    *activation_min = 0;
+    *activation_max = Limits::max();
+  }
+  else if (activation == Activation::RELU_N1_TO_1)
+  {
+    *activation_min = -1;
+    *activation_max = 1;
+  }
+  else if (activation == Activation::RELU6)
+  {
+    *activation_min = 0;
+    *activation_max = 6;
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported activation.");
+  }
+}
+
+// Explicit instantiations: the template body lives in this source file, so these are the
+// element types (float, int32_t, int64_t) for which a definition is emitted here.
+template void calculateActivationRange(Activation activation, float *activation_min,
+                                       float *activation_max);
+template void calculateActivationRange(Activation activation, int32_t *activation_min,
+                                       int32_t *activation_max);
+template void calculateActivationRange(Activation activation, int64_t *activation_min,
+                                       int64_t *activation_max);
+
+// Translates a fused activation into clamping bounds expressed in the quantized domain of
+// `output`. The representable range [qmin, qmax] is supplied by the caller; real-valued
+// activation limits are quantized with the output's scale and zero point and then
+// intersected with that range. NONE and TANH use the representable range unchanged.
+// Throws std::runtime_error for an activation that is not handled.
+static void calculateActivationRangeQuantizedImpl(Activation activation, int32_t qmin, int32_t qmax,
+                                                  const Tensor *output, int32_t *activation_min,
+                                                  int32_t *activation_max)
+{
+  const float scale = output->scale();
+  const int32_t zero_point = output->zero_point();
+
+  // Quantized counterpart of a real value.
+  const auto to_quantized = [scale, zero_point](float real_value) {
+    return zero_point + static_cast<int32_t>(std::round(real_value / scale));
+  };
+  // Quantized limits, never wider than the representable range.
+  const auto lower_limit = [&](float real_value) {
+    return std::max(qmin, to_quantized(real_value));
+  };
+  const auto upper_limit = [&](float real_value) {
+    return std::min(qmax, to_quantized(real_value));
+  };
+
+  switch (activation)
+  {
+    case Activation::NONE:
+    case Activation::TANH:
+      *activation_min = qmin;
+      *activation_max = qmax;
+      return;
+    case Activation::RELU:
+      *activation_min = lower_limit(0.0f);
+      *activation_max = qmax;
+      return;
+    case Activation::RELU_N1_TO_1:
+      *activation_min = lower_limit(-1.0f);
+      *activation_max = upper_limit(1.0f);
+      return;
+    case Activation::RELU6:
+      *activation_min = lower_limit(0.0f);
+      *activation_max = upper_limit(6.0f);
+      return;
+    default:
+      break;
+  }
+
+  throw std::runtime_error("Unsupported activation.");
+}
+
+// Computes the quantized clamping range of a fused activation for `output`.
+//
+// The representable range is chosen from the output's element type (U8, S8 or S16) and
+// then narrowed according to `activation`. Only a single zero point is supported
+// (asserted). Throws std::runtime_error for any other element type.
+// NOTE(review): the S8 and S16 lower bounds are -max() (-127 / -32767), not the type's
+// minimum — confirm this symmetric choice is intended for S8 as well.
+void calculateActivationRangeQuantized(Activation activation, const Tensor *output,
+                                       int32_t *activation_min, int32_t *activation_max)
+{
+  assert(output->zero_points().size() == 1);
+
+  int32_t qmin = 0;
+  int32_t qmax = 0;
+  const DataType dtype = output->element_type();
+  if (dtype == DataType::U8)
+  {
+    qmin = 0;
+    qmax = std::numeric_limits<uint8_t>::max();
+  }
+  else if (dtype == DataType::S8)
+  {
+    qmin = -std::numeric_limits<int8_t>::max();
+    qmax = std::numeric_limits<int8_t>::max();
+  }
+  else if (dtype == DataType::S16)
+  {
+    // For now, assume that signed int16 type implies signed symmetric quantization.
+    assert(output->zero_point() == 0);
+    qmin = -std::numeric_limits<int16_t>::max();
+    qmax = std::numeric_limits<int16_t>::max();
+  }
+  else
+  {
+    throw std::runtime_error("Unsupported type.");
+  }
+
+  calculateActivationRangeQuantizedImpl(activation, qmin, qmax, output, activation_min,
+                                        activation_max);
+}
+
+// Splits `double_multiplier` into a Q0.31 fixed-point significand (written to
+// `quantized_multiplier`) and a power-of-two exponent (written to `shift`).
+// A multiplier of exactly 0 is reported as (0, 0).
+void quantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
+{
+  if (double_multiplier == 0.0)
+  {
+    *quantized_multiplier = 0;
+    *shift = 0;
+    return;
+  }
+
+  constexpr int64_t one_q31 = INT64_C(1) << 31;
+
+  // frexp stores the binary exponent in *shift and returns the matching significand.
+  const double significand = std::frexp(double_multiplier, shift);
+  int64_t fixed = static_cast<int64_t>(std::round(significand * one_q31));
+
+  // Rounding can carry the significand up to exactly 2^31; halve it and bump the exponent
+  // so the value still fits in an int32.
+  if (fixed == one_q31)
+  {
+    fixed /= 2;
+    *shift += 1;
+  }
+  assert(fixed <= std::numeric_limits<int32_t>::max());
+
+  // A shift amount smaller than -31 would cause all bits to be shifted out
+  // and thus all results would be zero. That case is represented with a zero
+  // significand instead, which avoids right-shift operations by more than 31
+  // bits. It occurs roughly when abs(double_multiplier) < 2^-31, so tiny
+  // multipliers are effectively flushed to zero. Values in the range
+  // (roughly) [32, 63] could conceivably be handled as 'denormals', i.e.
+  // (shift==0, fixed < 2^30); the present handling is 'flush denormals to
+  // zero'. Nonzero denormals could be generated if a need arises.
+  if (*shift < -31)
+  {
+    *shift = 0;
+    fixed = 0;
+  }
+  *quantized_multiplier = static_cast<int32_t>(fixed);
+}
+
+// Variant of quantizeMultiplier for multipliers strictly between 0 and 1 (asserted).
+// The exponent produced for such a multiplier is asserted to be zero or negative and is
+// stored in `left_shift` unchanged.
+void quantizeMultiplierSmallerThanOneExp(double double_multiplier, int32_t *quantized_multiplier,
+                                         int *left_shift)
+{
+  assert(double_multiplier < 1.0);
+  assert(double_multiplier > 0.0);
+
+  int exponent;
+  quantizeMultiplier(double_multiplier, quantized_multiplier, &exponent);
+
+  assert(exponent <= 0);
+  *left_shift = exponent;
+}
+
+// Computes the shape that results from broadcasting two shapes against each other.
+// Dimensions are paired starting from the innermost one; a shape with fewer dimensions is
+// treated as having extent 1 in the missing leading positions. Each pair must either match
+// or contain a 1, otherwise LUCI_INTERPRETER_CHECK throws.
+Shape calculateShapeForBroadcast(const Shape &input1_shape, const Shape &input2_shape)
+{
+  const int rank1 = input1_shape.num_dims();
+  const int rank2 = input2_shape.num_dims();
+  const int out_rank = std::max(rank1, rank2);
+  Shape output_shape(out_rank);
+
+  // `back` counts positions from the innermost dimension (1 = last dimension).
+  for (int back = 1; back <= out_rank; ++back)
+  {
+    const int32_t input1_dim = back <= rank1 ? input1_shape.dim(rank1 - back) : 1;
+    const int32_t input2_dim = back <= rank2 ? input2_shape.dim(rank2 - back) : 1;
+
+    const bool need_broadcast = input1_dim != input2_dim;
+    const bool can_broadcast = input1_dim == 1 || input2_dim == 1;
+    LUCI_INTERPRETER_CHECK(!need_broadcast || can_broadcast);
+
+    output_shape.dim(out_rank - back) = std::max(input1_dim, input2_dim);
+  }
+
+  return output_shape;
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/Utils.h b/compiler/luci-micro/luci-interpreter/src/kernels/Utils.h
new file mode 100644
index 000000000..ebeb20e66
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/Utils.h
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_UTILS_H
+#define LUCI_INTERPRETER_KERNELS_UTILS_H
+
+#include "core/KernelParams.h"
+#include "luci_interpreter/core/Tensor.h"
+
+#include <tensorflow/lite/kernels/internal/types.h>
+
+#include <cassert>
+#include <cstdint>
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Throws std::runtime_error when `cond` evaluates to false. The message is built from the
+// source file, the line number and the stringified condition.
+// NOTE: the expansion is a bare `if` statement that already ends with ';'. It therefore
+// must not be used as the unbraced body of an if/else, and code in this project may rely
+// on that trailing ';'.
+#define LUCI_INTERPRETER_CHECK(cond)                                                         \
+  if (!(cond))                                                                               \
+    throw std::runtime_error(std::string(__FILE__) + ":" + std::to_string(__LINE__) + +"(" + \
+                             std::string(#cond) + ") was not true.");
+
+// Returns the padding applied before the input along one spatial dimension, i.e. half of
+// the extent by which the window positions exceed the input, rounded toward zero and
+// never negative.
+inline int32_t computePadding(int32_t stride, int32_t dilation_rate, int32_t in_size,
+                              int32_t filter_size, int32_t out_size)
+{
+  // Extent covered by the filter once dilation gaps are included.
+  const int32_t dilated_filter = (filter_size - 1) * dilation_rate + 1;
+  const int32_t half_excess = ((out_size - 1) * stride + dilated_filter - in_size) / 2;
+  if (half_excess > 0)
+    return half_excess;
+  return 0;
+}
+
+// Like computePadding, but works on the total (non-negative) padding: returns half of it
+// and writes the leftover odd element (0 or 1) to `offset`.
+inline int32_t computePaddingWithOffset(int32_t stride, int32_t dilation_rate, int32_t in_size,
+                                        int32_t filter_size, int32_t out_size, int32_t *offset)
+{
+  const int32_t dilated_filter = (filter_size - 1) * dilation_rate + 1;
+  int32_t excess = (out_size - 1) * stride + dilated_filter - in_size;
+  if (excess < 0)
+    excess = 0;
+
+  *offset = excess % 2;
+  return excess / 2;
+}
+
+// Returns the output extent along one spatial dimension for the given padding scheme.
+// SAME depends only on the input size and stride; VALID also accounts for the dilated
+// filter extent. Any other padding value trips an assert and yields 0.
+inline int32_t computeOutputSize(Padding padding, int32_t image_size, int32_t filter_size,
+                                 int32_t stride, int32_t dilation_rate = 1)
+{
+  const int32_t dilated_filter = (filter_size - 1) * dilation_rate + 1;
+
+  if (padding == Padding::SAME)
+    return (image_size + stride - 1) / stride;
+
+  if (padding == Padding::VALID)
+    return (image_size + stride - dilated_filter) / stride;
+
+  assert(false);
+  return 0;
+}
+
+// Flattens the 4-D index (d0, d1, d2, d3) into a linear offset, using dimensions 1..3 of
+// `shape` as the extents of the three inner axes.
+inline int32_t calcOffset(const Shape &shape, int32_t d0, int32_t d1, int32_t d2, int32_t d3)
+{
+  int32_t offset = d0;
+  offset = offset * shape.dim(1) + d1;
+  offset = offset * shape.dim(2) + d2;
+  offset = offset * shape.dim(3) + d3;
+  return offset;
+}
+
+// Writes the [min, max] clamping range that implements `activation` for element type T
+// through the two output pointers. Only declared here; the definition and its explicit
+// instantiations are provided by a source file.
+template <typename T>
+void calculateActivationRange(Activation activation, T *activation_min, T *activation_max);
+
+// Quantized counterpart: the range is expressed in the quantized domain of the `output`
+// tensor and written through the two int32 output pointers.
+void calculateActivationRangeQuantized(Activation activation, const Tensor *output,
+                                       int32_t *activation_min, int32_t *activation_max);
+
+// Terminal overload: no candidate types are left, so T matched none of them.
+template <typename T> constexpr bool one_of_types() { return false; }
+
+// True when T is the same type as Candidate or as any of the types in Rest.
+template <typename T, typename Candidate, typename... Rest> constexpr bool one_of_types()
+{
+  return std::is_same<T, Candidate>::value ? true : one_of_types<T, Rest...>();
+}
+
+/**
+ * Fills activation min and max parameters depending on given data type and activation
+ *
+ * Exactly one pair of fields is written: the float pair for T = float, the quantized
+ * (int32) pair for T = int32_t and the int64 pair for T = int64_t.
+ *
+ * T is a template parameter, so after optimization only the required branch is left
+ *
+ * @tparam T data type of arithmetic operation output tensor
+ * @param p tflite params to fill
+ * @param act luci_interpreter::Activation of arithmetic operation
+ */
+template <typename T>
+void fillArithmeticActivationRange(tflite::ArithmeticParams &p, Activation act)
+{
+  static_assert(one_of_types<T, float, int32_t, int64_t>(), "Unsupported dtype");
+
+  if (std::is_same<T, float>::value)
+    calculateActivationRange(act, &p.float_activation_min, &p.float_activation_max);
+  // Chained with `else`: as an independent `if`, T = float would also reach the final
+  // `else` and fill the int64 range.
+  else if (std::is_same<T, int32_t>::value)
+    calculateActivationRange(act, &p.quantized_activation_min, &p.quantized_activation_max);
+  else
+    calculateActivationRange(act, &p.int64_activation_min, &p.int64_activation_max);
+}
+
+// Decompose a double multiplier into a Q0.31 int32 representation of its
+// significand, and shift representation of its exponent.
+//
+// Handles an arbitrary positive multiplier. The 'shift' output-value is
+// basically the 'floating-point exponent' of the multiplier:
+// Negative for a right-shift (when the multiplier is <1), positive for a
+// left-shift (when the multiplier is >1)
+//
+// Both results are returned through the output pointers
+// `quantized_multiplier` and `shift`.
+void quantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift);
+
+// Decompose a double multiplier into a Q0.31 int32 representation of its
+// significand, and the shift representation of its exponent.
+//
+// Restricted to the case where the multiplier < 1 (and non-negative).
+// NOTE(review): the output parameter is named `left_shift`, so a right-shift
+// would be expressed as a zero or negative value — confirm against the
+// definition that the exponent is stored without negation.
+void quantizeMultiplierSmallerThanOneExp(double double_multiplier, int32_t *quantized_multiplier,
+                                         int *left_shift);
+
+// Returns the shape obtained by broadcasting `input1_shape` and `input2_shape` together.
+Shape calculateShapeForBroadcast(const Shape &input1_shape, const Shape &input2_shape);
+
+// Returns (input_scale * filter_scale) / output_scale as a double. The product of the two
+// input scales is formed in single precision before being widened, and must be
+// non-negative (checked).
+inline double getQuantizedConvolutionMultipler(float input_scale, float filter_scale,
+                                               float output_scale)
+{
+  const double input_product_scale = static_cast<double>(input_scale * filter_scale);
+  LUCI_INTERPRETER_CHECK(input_product_scale >= 0);
+
+  const double widened_output_scale = static_cast<double>(output_scale);
+  return input_product_scale / widened_output_scale;
+}
+
+// TODO rename getQuantizedConvolutionMultiplers to something more general
+// it is used for non conv operators too
+//
+// Per-channel variant: returns one multiplier per entry of `filter_scale`, in the same
+// order, each computed with the shared input and output scales.
+inline std::vector<double> getQuantizedConvolutionMultiplers(float input_scale,
+                                                             const std::vector<float> &filter_scale,
+                                                             float output_scale)
+{
+  std::vector<double> multipliers;
+  multipliers.reserve(filter_scale.size());
+
+  for (const float channel_scale : filter_scale)
+  {
+    multipliers.push_back(
+      getQuantizedConvolutionMultipler(input_scale, channel_scale, output_scale));
+  }
+
+  return multipliers;
+}
+
+// Quantized multiplier of a single channel, stored as a (shift, multiplier) pair.
+// The members have no in-class initializers.
+struct ChannelQuantMultipliers
+{
+  int shift;
+  int32_t multiplier;
+  ChannelQuantMultipliers() = default;
+};
+
+// Quantizes every effective scale with quantizeMultiplier and returns the results in the
+// same order as the input.
+inline std::vector<ChannelQuantMultipliers>
+quantizeMultipliers(const std::vector<double> &effective_scale)
+{
+  std::vector<ChannelQuantMultipliers> params(effective_scale.size());
+
+  auto out = params.begin();
+  for (const double scale : effective_scale)
+  {
+    quantizeMultiplier(scale, &out->multiplier, &out->shift);
+    ++out;
+  }
+
+  return params;
+}
+
+// Helper wrapper to hide broadcast logic.
+//
+// Indexing a wrapped single-element vector returns that one element for every index;
+// otherwise the index is used as is. Only a reference to the vector is stored, so the
+// vector must outlive the wrapper.
+template <typename T> class BroadcastableWrapper
+{
+public:
+  BroadcastableWrapper(const std::vector<T> &v) : _v(v), _stride(v.size() == 1 ? 0 : 1) {}
+
+  // Read-only access (returns a copy), so it is usable on a const wrapper as well.
+  T operator[](int idx) const { return _v[idx * _stride]; }
+
+private:
+  const std::vector<T> &_v;
+  int _stride; // 0 when the single element is broadcast, 1 otherwise
+};
+
+// Converts the shape of `tensor` into a tflite::RuntimeShape with the same dimensions.
+// A null tensor yields a default-constructed RuntimeShape.
+inline tflite::RuntimeShape getTensorShape(const Tensor *tensor)
+{
+  if (tensor == nullptr)
+    return tflite::RuntimeShape();
+
+  const Shape &dims = tensor->shape();
+  tflite::RuntimeShape runtime_shape(dims.num_dims());
+  for (int axis = 0; axis < dims.num_dims(); ++axis)
+    runtime_shape.SetDim(axis, dims.dim(axis));
+
+  return runtime_shape;
+}
+
+// Typed read-only view of a tensor's data; a null tensor gives a null pointer.
+template <typename T> const T *getTensorData(const Tensor *tensor)
+{
+  if (tensor == nullptr)
+    return nullptr;
+  return tensor->data<T>();
+}
+
+// Typed mutable view of a tensor's data; a null tensor gives a null pointer.
+template <typename T> T *getTensorData(Tensor *tensor)
+{
+  if (tensor == nullptr)
+    return nullptr;
+  return tensor->data<T>();
+}
+
+// A list of tensors in a format that can be used by kernels like split and
+// concatenation.
+//
+// T is the element type used for the data pointers; `is_const` selects between
+// read-only (const T / const Tensor) and mutable access.
+template <typename T, bool is_const> class VectorOfTensors
+{
+public:
+  using ElementT = typename std::conditional<is_const, const T, T>::type;
+  using TensorT = typename std::conditional<is_const, const Tensor, Tensor>::type;
+
+  // Build with the tensors in 'tensor_list'.
+  explicit VectorOfTensors(const std::vector<TensorT *> &tensor_list)
+  {
+    const int num_tensors = tensor_list.size();
+
+    all_data_.reserve(num_tensors);
+    all_shape_.reserve(num_tensors);
+    all_shape_ptr_.reserve(num_tensors);
+
+    for (TensorT *tensor : tensor_list)
+    {
+      all_data_.push_back(getTensorData<T>(tensor));
+      all_shape_.push_back(getTensorShape(tensor));
+    }
+
+    // Taking the pointer from inside a std::vector is only OK if the vector is
+    // never modified afterwards, so all_shape_ is fully populated in the
+    // previous loop and only then are the element addresses collected here.
+    for (tflite::RuntimeShape &shape : all_shape_)
+    {
+      all_shape_ptr_.push_back(&shape);
+    }
+  }
+  // Return a pointer to the data pointers of all tensors in the list. For
+  // example:
+  //   float* const* f = v.data();
+  //   f[0][1] is the second element of the first tensor.
+  ElementT *const *data() const { return all_data_.data(); }
+
+  // Return a pointer to the shape pointers of all tensors in the list. For
+  // example:
+  //   const RuntimeShape* const* d = v.shapes();
+  //   d[1] points to the dimensions of the second tensor in the list.
+  const tflite::RuntimeShape *const *shapes() const { return all_shape_ptr_.data(); }
+
+private:
+  std::vector<ElementT *> all_data_;
+  std::vector<tflite::RuntimeShape> all_shape_;
+  // Addresses of the elements of all_shape_, in the same order.
+  std::vector<tflite::RuntimeShape *> all_shape_ptr_;
+};
+
+// A list of quantized tensors in a format that can be used by kernels like
+// split and concatenation. In addition to the uint8 data and shape pointers of the
+// base class it records the zero point and scale of every tensor, in list order.
+template <bool is_const> class VectorOfQuantizedTensors : public VectorOfTensors<uint8_t, is_const>
+{
+public:
+  using typename VectorOfTensors<uint8_t, is_const>::TensorT;
+
+  // Build with the tensors in 'tensor_list'.
+  explicit VectorOfQuantizedTensors(const std::vector<TensorT *> &tensor_list)
+    : VectorOfTensors<uint8_t, is_const>(tensor_list)
+  {
+    for (size_t i = 0; i < tensor_list.size(); ++i)
+    {
+      const TensorT *tensor = tensor_list[i];
+      zero_point_.push_back(tensor->zero_point());
+      scale_.push_back(tensor->scale());
+    }
+  }
+
+  // Contiguous array with one scale per tensor.
+  const float *scale() const { return scale_.data(); }
+  // Contiguous array with one zero point per tensor.
+  const int32_t *zero_point() const { return zero_point_.data(); }
+
+private:
+  std::vector<int32_t> zero_point_;
+  std::vector<float> scale_;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_UTILS_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/While.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/While.cpp
new file mode 100644
index 000000000..153bd1a99
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/While.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/While.h"
+#include "kernels/Utils.h"
+
+#include <cstring>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+namespace
+{
+
+// Copies the contents of every tensor in `src` into the tensor at the same position in
+// `dst`. Each destination is resized to the source shape first; the element types of a
+// pair must match. `dst` must hold at least as many tensors as `src`.
+// Throws (via LUCI_INTERPRETER_CHECK) when either requirement is violated.
+void copy(const std::vector<const Tensor *> &src, const std::vector<Tensor *> &dst)
+{
+  // dst is indexed with src's indices below, so a shorter dst would be read out of bounds.
+  LUCI_INTERPRETER_CHECK(dst.size() >= src.size());
+
+  for (size_t i = 0; i < src.size(); ++i)
+  {
+    LUCI_INTERPRETER_CHECK(dst[i]->element_type() == src[i]->element_type());
+    dst[i]->resize(src[i]->shape());
+
+    // Byte count of the payload: element count times the size of one element.
+    const int32_t num_elements = src[i]->shape().num_elements();
+    const std::size_t element_size = getDataTypeSize(src[i]->element_type());
+    std::memcpy(dst[i]->data<void>(), src[i]->data<void>(), num_elements * element_size);
+  }
+}
+
+// Overload for mutable source tensors: builds a vector of const pointers to the same
+// tensors and forwards to the const-source overload.
+void copy(const std::vector<Tensor *> &src, const std::vector<Tensor *> &dst)
+{
+  const std::vector<const Tensor *> const_view(src.begin(), src.end());
+  copy(const_view, dst);
+}
+
+// TODO: Think about how allocate memory for output in main graph
+//
+// Calls run_graph->configureAllocations() for each tensor, in list order.
+void configureTensorsAllocations(const std::vector<Tensor *> &tensors, RuntimeGraph *run_graph)
+{
+  for (size_t i = 0; i < tensors.size(); ++i)
+  {
+    run_graph->configureAllocations(tensors[i]);
+  }
+}
+
+} // namespace
+
+// Hands the input/output tensor lists to the Kernel base class and stores the pointers to
+// the condition and body graphs.
+While::While(std::vector<const Tensor *> inputs, std::vector<Tensor *> outputs,
+             RuntimeGraph *cond_graph, RuntimeGraph *body_graph)
+  : Kernel(std::move(inputs), std::move(outputs)), _cond_graph(cond_graph), _body_graph(body_graph)
+{
+}
+
+// Validates the loop wiring. The body graph must take and return as many tensors as the
+// kernel has inputs/outputs, the condition graph must take as many inputs as the kernel,
+// and the condition graph must produce exactly one BOOL tensor. Any violation throws via
+// LUCI_INTERPRETER_CHECK.
+void While::configure()
+{
+  LUCI_INTERPRETER_CHECK(_body_graph->getInputTensors().size() == getInputTensors().size());
+  LUCI_INTERPRETER_CHECK(_body_graph->getOutputTensors().size() == getOutputTensors().size());
+  LUCI_INTERPRETER_CHECK(_body_graph->getOutputTensors().size() == getInputTensors().size());
+
+  LUCI_INTERPRETER_CHECK(_cond_graph->getInputTensors().size() == getInputTensors().size());
+
+  const auto &cond_outputs = _cond_graph->getOutputTensors();
+  LUCI_INTERPRETER_CHECK(cond_outputs.size() == 1);
+  LUCI_INTERPRETER_CHECK(cond_outputs[0]->element_type() == DataType::BOOL);
+}
+
+/**
+ * Runs the loop: the condition graph is executed first and the body graph is executed for
+ * as long as the condition output is true. The condition graph's input tensors hold the
+ * loop state between iterations.
+ *
+ * @note Dynamic shape such as {1, 0, 8} may fail in tensor->data()
+ */
+void While::execute() const
+{
+  const auto &cond_inputs = _cond_graph->getInputTensors();
+  const auto &cond_outputs = _cond_graph->getOutputTensors();
+
+  configureTensorsAllocations(cond_inputs, _cond_graph);
+
+  // Seed the loop state with the kernel inputs.
+  copy(getInputTensors(), cond_inputs);
+
+  const auto &body_inputs = _body_graph->getInputTensors();
+  const auto &body_outputs = _body_graph->getOutputTensors();
+
+  configureTensorsAllocations(body_inputs, _body_graph);
+
+  while (true)
+  {
+    _cond_graph->execute();
+
+    // The first element of the single condition output decides whether to continue.
+    bool cond_value = cond_outputs[0]->data<bool>()[0];
+    if (!cond_value)
+      break;
+
+    // Feed the current state to the body ...
+    copy(cond_inputs, body_inputs);
+
+    _body_graph->execute();
+
+    // ... and store its results as the new state for the next condition check.
+    copy(body_outputs, cond_inputs);
+  }
+
+  // The state left after the last condition check is the kernel's result.
+  copy(cond_inputs, getOutputTensors());
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/While.h b/compiler/luci-micro/luci-interpreter/src/kernels/While.h
new file mode 100644
index 000000000..f758df3f3
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/While.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_WHILE_H
+#define LUCI_INTERPRETER_KERNELS_WHILE_H
+
+#include "core/Kernel.h"
+#include "core/RuntimeGraph.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Kernel for a while-loop whose condition and body are given as separate runtime graphs.
+class While : public Kernel
+{
+public:
+  While(std::vector<const Tensor *> inputs, std::vector<Tensor *> outputs, RuntimeGraph *cond_graph,
+        RuntimeGraph *body_graph);
+
+  // Index-based access to the kernel's input/output tensors; the index is used as is.
+  const Tensor *input(int index) const { return _inputs[index]; }
+  Tensor *output(int index) const { return _outputs[index]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  // Raw pointers fixed at construction. This class declares no destructor, so it does
+  // not delete the graphs (presumably they are owned elsewhere — TODO confirm).
+  RuntimeGraph *const _cond_graph = nullptr;
+  RuntimeGraph *const _body_graph = nullptr;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_WHILE_H
diff --git a/compiler/luci-micro/luci-interpreter/src/kernels/While.test.cpp b/compiler/luci-micro/luci-interpreter/src/kernels/While.test.cpp
new file mode 100644
index 000000000..cb8f89130
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/kernels/While.test.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "core/RuntimeModule.h"
+#include "kernels/Add.h"
+#include "kernels/Less.h"
+#include "kernels/While.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+// Adds a graph to `module` that serves as the loop condition: one scalar input of type
+// `dtype`, one scalar BOOL output, and a single Less kernel comparing the input against
+// `input_cond`. Memory for both graph tensors is allocated right away.
+RuntimeGraph *buildCondSubgraph(RuntimeModule *module, DataType dtype, Tensor *input_cond,
+                                IMemoryManager *memory_manager)
+{
+  RuntimeGraph *graph = module->addGraph(memory_manager);
+  Tensor *input =
+    graph->addTensor(std::make_unique<Tensor>(dtype, Shape{}, AffineQuantization{}, ""));
+  Tensor *output =
+    graph->addTensor(std::make_unique<Tensor>(DataType::BOOL, Shape{}, AffineQuantization{}, ""));
+
+  memory_manager->allocate_memory(*input);
+  memory_manager->allocate_memory(*output);
+
+  graph->setInputTensors({input});
+  graph->setOutputTensors({output});
+
+  graph->addKernel(std::make_unique<Less>(input, input_cond, output));
+
+  return graph;
+}
+
+// Adds a graph to `module` that serves as the loop body: one scalar input and one scalar
+// output of type `dtype`, and a single Add kernel (no fused activation) that adds
+// `input_add` to the input. Memory for both graph tensors is allocated right away.
+RuntimeGraph *buildBodySubgraph(RuntimeModule *module, DataType dtype, Tensor *input_add,
+                                IMemoryManager *memory_manager)
+{
+  RuntimeGraph *graph = module->addGraph(memory_manager);
+  Tensor *input =
+    graph->addTensor(std::make_unique<Tensor>(dtype, Shape{}, AffineQuantization{}, ""));
+  Tensor *output =
+    graph->addTensor(std::make_unique<Tensor>(dtype, Shape{}, AffineQuantization{}, ""));
+
+  memory_manager->allocate_memory(*input);
+  memory_manager->allocate_memory(*output);
+
+  graph->setInputTensors({input});
+  graph->setOutputTensors({output});
+
+  AddParams params{};
+  params.activation = Activation::NONE;
+  graph->addKernel(std::make_unique<Add>(input, input_add, output, params));
+
+  return graph;
+}
+
+// Counts up from 1 in steps of 1 for as long as the value is less than 10; the loop is
+// expected to finish with the value 10.
+TEST(WhileTest, FloatLoop10)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input = makeInputTensor<DataType::FLOAT32>({1}, {1}, memory_manager.get());
+  Tensor output = makeOutputTensor(DataType::FLOAT32);
+
+  // Loop bound used by the condition graph and increment used by the body graph.
+  Tensor input_cond = makeInputTensor<DataType::FLOAT32>({1}, {10}, memory_manager.get());
+  Tensor input_add = makeInputTensor<DataType::FLOAT32>({1}, {1}, memory_manager.get());
+
+  RuntimeModule module(nullptr);
+  RuntimeGraph *cond_graph =
+    buildCondSubgraph(&module, DataType::FLOAT32, &input_cond, memory_manager.get());
+  RuntimeGraph *body_graph =
+    buildBodySubgraph(&module, DataType::FLOAT32, &input_add, memory_manager.get());
+
+  While kernel({&input}, {&output}, cond_graph, body_graph);
+  kernel.configure();
+  memory_manager->allocate_memory(output);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output), FloatArrayNear({10}));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/CMakeLists.txt b/compiler/luci-micro/luci-interpreter/src/loader/CMakeLists.txt
new file mode 100644
index 000000000..292771592
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/CMakeLists.txt
@@ -0,0 +1,39 @@
+# Sources that are always part of the loader library.
+set(SOURCES
+    GraphLoader.h
+    GraphLoader.cpp
+    KernelBuilderHelper.h
+    KernelBuilderHelper.cpp
+    KernelBuilder.h
+    KernelBuilder.cpp
+    ModuleLoader.h
+    ModuleLoader.cpp
+    RuntimeToIR.h
+    nodes/Builders.h)
+
+# include kernel specific builders
+# Each REGISTER_KERNEL(NODE) call appends nodes/<NODE>.cpp to SOURCES; the calls are
+# presumably made by the included ${KERNEL_REGISTER_FILE} (not visible here).
+macro(REGISTER_KERNEL NODE)
+  list(APPEND SOURCES "nodes/${NODE}.cpp")
+endmacro(REGISTER_KERNEL)
+include(${KERNEL_REGISTER_FILE})
+
+add_library(${LUCI_INTERPRETER_LOADER} STATIC ${SOURCES})
+# Build position-independent code unless NNCC_LIBRARY_NO_PIC is set.
+if (NOT NNCC_LIBRARY_NO_PIC)
+  set_target_properties(${LUCI_INTERPRETER_LOADER} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif(NOT NNCC_LIBRARY_NO_PIC)
+target_include_directories(${LUCI_INTERPRETER_LOADER} PUBLIC "${LUCI_INTERPRETER_PAL_DIR}")
+target_include_directories(${LUCI_INTERPRETER_LOADER} PUBLIC "${LUCI_INTERPRETER_SOURCE_DIR}")
+
+target_link_libraries(${LUCI_INTERPRETER_LOADER}
+        PUBLIC luci_lang ${LUCI_INTERPRETER_CORE}
+        PRIVATE ${LUCI_INTERPRETER_KERNELS} nncc_common luci_plan)
+
+# Everything below only applies when tests are enabled.
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
+set(TEST_SOURCES KernelBuilder.test.cpp)
+
+GTest_AddTest(${LUCI_INTERPRETER_LOADER}_test ${TEST_SOURCES})
+target_link_libraries(${LUCI_INTERPRETER_LOADER}_test ${LUCI_INTERPRETER_LOADER})
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-micro/luci-interpreter/src/loader/GraphLoader.cpp
new file mode 100644
index 000000000..40207090b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/GraphLoader.cpp
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "loader/GraphLoader.h"
+
+#include "loader/KernelBuilder.h"
+
+#include <luci/Plan/CircleNodeExecutionPlan.h>
+#include <loco/IR/Algorithm.h>
+
+namespace luci_interpreter
+{
+namespace
+{
+
+template <typename NodeT> Shape getNodeShape(const NodeT *node)
+{
+  // Copy each dimension value of the node into a runtime Shape of the same rank.
+  const auto rank = node->rank();
+  Shape result(rank);
+  for (uint32_t axis = 0; axis < rank; ++axis)
+    result.dim(axis) = node->dim(axis).value();
+  return result;
+}
+
+template <DataType DT> const void *getNodeDataImpl(const luci::CircleConst *node, size_t *data_size)
+{
+  // Keep the element count unsigned: storing it in int32_t would wrap for large constants
+  // and corrupt the byte size computed below.
+  const size_t num_elements = node->size<DT>();
+  *data_size = num_elements * getDataTypeSize(DT);
+  if (*data_size == 0)
+    return nullptr; // empty constant: there is no element whose address could be taken
+
+  // FIXME There is no good way to get the pointer to the data currently.
+  // Address of element 0 (assumes the node stores its elements contiguously -- TODO confirm).
+  return &node->at<DT>(0);
+}
+
+const void *getNodeData(const luci::CircleConst *node, size_t *data_size)
+{
+  switch (node->dtype()) // one getNodeDataImpl<> instantiation per supported element type
+  {
+    case DataType::U8:
+      return getNodeDataImpl<DataType::U8>(node, data_size);
+    case DataType::FLOAT32:
+      return getNodeDataImpl<DataType::FLOAT32>(node, data_size);
+    case DataType::S8:
+      return getNodeDataImpl<DataType::S8>(node, data_size);
+    case DataType::S16:
+      return getNodeDataImpl<DataType::S16>(node, data_size);
+    case DataType::S32:
+      return getNodeDataImpl<DataType::S32>(node, data_size);
+    case DataType::S64:
+      return getNodeDataImpl<DataType::S64>(node, data_size);
+    case DataType::BOOL:
+      return getNodeDataImpl<DataType::BOOL>(node, data_size);
+    default:
+      throw std::runtime_error("Unsupported type."); // any dtype not listed above
+  }
+}
+
+const void *getNodeData(const luci::CircleCustom *node, size_t *data_size)
+{
+  if (node->custom_code() != "CircleReferencingConst")
+    return nullptr; // other custom codes have no referenced data to return
+  // helper struct which describes data loaded to custom_options of CircleReferencingConst node
+  // TODO move this struct to header
+  struct ConstDataReference
+  {
+    const uint8_t *data = nullptr;
+    uint32_t size = 0;
+  };
+  // Never reinterpret an options buffer that is too small to hold a ConstDataReference.
+  const auto &custom_options = node->custom_options();
+  if (custom_options.size() < sizeof(ConstDataReference))
+    throw std::runtime_error("Invalid custom_options of CircleReferencingConst node.");
+  const auto &const_data_ref = *reinterpret_cast<const ConstDataReference *>(custom_options.data());
+  *data_size = const_data_ref.size;
+  return const_data_ref.data;
+}
+
+bool isExecutableNode(const luci::CircleNode *node)
+{
+  switch (node->opcode())
+  {
+    // Constants and the nodes that denote inputs / outputs of a graph.
+    case luci::CircleOpcode::CIRCLECONST:
+    case luci::CircleOpcode::CIRCLEINPUT:
+    case luci::CircleOpcode::CIRCLEOUTPUT:
+    case luci::CircleOpcode::CIRCLEOUTPUTEXCLUDE:
+    // The following nodes denote outputs of multiple-output nodes (plus CIRCLEVARIABLE).
+    case luci::CircleOpcode::CIRCLEBIDIRECTIONAL_SEQUENCE_LSTM_OUT:
+    case luci::CircleOpcode::CIRCLECUSTOMOUT:
+    case luci::CircleOpcode::CIRCLEIFOUT:
+    case luci::CircleOpcode::CIRCLENONMAXSUPPRESSIONV4OUT:
+    case luci::CircleOpcode::CIRCLENONMAXSUPPRESSIONV5OUT:
+    case luci::CircleOpcode::CIRCLESPLITOUT:
+    case luci::CircleOpcode::CIRCLESPLITVOUT:
+    case luci::CircleOpcode::CIRCLETOPKV2OUT:
+    case luci::CircleOpcode::CIRCLEUNIQUEOUT:
+    case luci::CircleOpcode::CIRCLEUNPACKOUT:
+    case luci::CircleOpcode::CIRCLEVARIABLE:
+    case luci::CircleOpcode::CIRCLEWHILEOUT:
+      return false;
+    // Custom nodes are executable unless their custom code is listed below.
+    case luci::CircleOpcode::CUSTOM:
+    {
+      auto const custom_node = loco::must_cast<const luci::CircleCustom *>(node);
+      // CircleReferencingConst is the only custom code treated as non-executable so far.
+      // TODO handle more non-executable Custom ops here
+      if (custom_node->custom_code() == "CircleReferencingConst")
+        return false;
+
+      return true;
+    }
+    default:
+      return true; // every other opcode is treated as executable
+  }
+}
+
+bool isTensorProducingNode(const luci::CircleNode *node)
+{
+  switch (node->opcode())
+  {
+    // Output nodes do not produce tensors.
+    case luci::CircleOpcode::CIRCLEOUTPUT:
+    // The following nodes are multiple-output nodes. They do not produce tensors themselves;
+    // the tensors are produced by the corresponding *Out nodes instead.
+    case luci::CircleOpcode::BIDIRECTIONAL_SEQUENCE_LSTM:
+    case luci::CircleOpcode::CUSTOM:
+    case luci::CircleOpcode::IF:
+    case luci::CircleOpcode::NON_MAX_SUPPRESSION_V4:
+    case luci::CircleOpcode::NON_MAX_SUPPRESSION_V5:
+    case luci::CircleOpcode::SPLIT:
+    case luci::CircleOpcode::SPLIT_V:
+    case luci::CircleOpcode::TOPK_V2:
+    case luci::CircleOpcode::UNIQUE:
+    case luci::CircleOpcode::UNPACK:
+    case luci::CircleOpcode::WHILE:
+      return false;
+    default:
+      return true; // every other opcode gets a tensor of its own
+  }
+}
+
+bool isSupportedCustomNode(const luci::CircleNode *node)
+{
+  // Whitelist of Custom operators that the loader accepts.
+  // TODO handle more Custom ops here
+  const auto *custom = loco::must_cast<const luci::CircleCustom *>(node);
+  const bool is_referencing_const = custom->custom_code() == "CircleReferencingConst";
+
+  // Every other custom code is reported as unsupported.
+  return is_referencing_const;
+}
+
+} // namespace
+
+GraphLoader::GraphLoader(
+  const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
+  const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
+  std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor, IMemoryManager *memory_manager)
+  : _graph(graph), _runtime_graph(runtime_graph), _runtime_to_ir(runtime_to_ir),
+    _memory_manager(memory_manager), _graph_to_runtime_graph(graph_to_runtime_graph),
+    _node_to_tensor(node_to_tensor) // listed in member declaration order (avoids -Wreorder)
+{
+}
+
+void GraphLoader::loadTensors()
+{
+  for (uint32_t i = 0; i < _graph->nodes()->size(); ++i)
+  {
+    const auto *node = loco::must_cast<const luci::CircleNode *>(_graph->nodes()->at(i));
+    // Custom operators rejected by isSupportedCustomNode() abort loading with an exception.
+    if (node->opcode() == luci::CircleOpcode::CUSTOM && !isSupportedCustomNode(node))
+      throw std::runtime_error("Unsupported Custom operator. " + node->name());
+    // Nodes that do not produce a tensor themselves get no runtime Tensor.
+    if (!isTensorProducingNode(node))
+      continue;
+
+    // Only Input, Const, CustomOut and Variable nodes copy their shape from the IR here; other
+    // tensors start with an empty shape (shapes of intermediate tensors will be inferred).
+    Shape shape{};
+    switch (node->opcode())
+    {
+      case luci::CircleOpcode::CIRCLECONST:
+      case luci::CircleOpcode::CIRCLECUSTOMOUT:
+      case luci::CircleOpcode::CIRCLEINPUT:
+      case luci::CircleOpcode::CIRCLEVARIABLE:
+        shape = getNodeShape(node);
+        break;
+      default:
+        break;
+    }
+    // Copy the quantization parameters of the node, if it has any.
+    AffineQuantization quantization;
+    if (node->quantparam() != nullptr)
+    {
+      const luci::CircleQuantParam *params = node->quantparam();
+      assert(params->scale.size() == params->zerop.size());
+      quantization.scale.assign(params->scale.cbegin(), params->scale.cend());
+      quantization.zero_point.assign(params->zerop.cbegin(), params->zerop.cend());
+      quantization.quantized_dimension = params->quantized_dimension;
+    }
+
+    auto tensor = std::make_unique<Tensor>(node->dtype(), std::move(shape), std::move(quantization),
+                                           node->name());
+
+    // If node has execution plan then read memory offsets for nodes
+    // from the beginning of shared memory buffer. Used in Static Memory Manager.
+    if (luci::has_execution_plan(node))
+    {
+      auto execution_plan = luci::get_execution_plan(node);
+      assert(!execution_plan.offsets().empty());
+      tensor->set_offset(execution_plan.offsets().front());
+    }
+    // Tensors backed by constant data get their memory allocated and the data written right away.
+    if (const auto *const_node = dynamic_cast<const luci::CircleConst *>(node))
+    {
+      size_t data_size{};
+      const void *const_data = getNodeData(const_node, &data_size);
+      if (const_data != nullptr)
+      {
+        _memory_manager->allocate_memory(*tensor);
+        tensor->writeData(const_data, data_size);
+      }
+    }
+    else if (const auto *custom_out_node = dynamic_cast<const luci::CircleCustomOut *>(node))
+    {
+      const auto *custom_node =
+        loco::must_cast<const luci::CircleCustom *>(custom_out_node->input());
+
+      if (custom_node->custom_code() == "CircleReferencingConst")
+      {
+        size_t data_size{};
+        const void *const_data = getNodeData(custom_node, &data_size);
+        if (const_data != nullptr)
+        {
+          _memory_manager->allocate_memory(*tensor);
+          tensor->writeData(const_data, data_size);
+        }
+      }
+    }
+    // Record both mappings, then hand ownership of the tensor to the runtime graph.
+    _node_to_tensor.emplace(node, tensor.get());
+    _runtime_to_ir.tensor_to_node.emplace(tensor.get(), node);
+
+    _runtime_graph->addTensor(std::move(tensor));
+  }
+}
+
+void GraphLoader::initInputOutputTensors() const
+{
+  // Graph inputs: look up the tensor of every input node and allocate its memory.
+  std::vector<Tensor *> in_tensors;
+  for (const loco::Node *in_node : loco::input_nodes(_graph))
+  {
+    Tensor *tensor = _node_to_tensor.at(in_node);
+    _memory_manager->allocate_memory(*tensor);
+    in_tensors.push_back(tensor);
+  }
+  _runtime_graph->setInputTensors(in_tensors);
+  // Graph outputs: each CircleOutput exposes the tensor of the node it reads from.
+  std::vector<Tensor *> out_tensors;
+  for (const loco::Node *out_node : loco::output_nodes(const_cast<loco::Graph *>(_graph)))
+  {
+    const auto *circle_output = loco::must_cast<const luci::CircleOutput *>(out_node);
+    out_tensors.push_back(_node_to_tensor.at(circle_output->from()));
+  }
+  _runtime_graph->setOutputTensors(out_tensors);
+}
+
+void GraphLoader::loadOperators()
+{
+  // Creates a kernel for every executable node and adds it to the runtime graph.
+  KernelBuilder kernel_builder(_graph_to_runtime_graph, _node_to_tensor);
+
+  // Builds the kernel for `node` (if it is executable) and registers it in the runtime graph
+  // and in the kernel -> node mapping. This has to be done in execution order.
+  auto const add_kernel_for = [&](const luci::CircleNode *node) {
+    if (!isExecutableNode(node))
+      return;
+    std::unique_ptr<Kernel> kernel = kernel_builder.build(node);
+    _runtime_to_ir.kernel_to_node.emplace(kernel.get(), node);
+    _runtime_graph->addKernel(std::move(kernel));
+  };
+
+  // The loco traversal helpers used below take a non-const graph.
+  auto graph = const_cast<loco::Graph *>(_graph);
+  auto const graph_nodes = loco::all_nodes(graph);
+
+  // The execution plan is used only if every node of the graph is annotated with one.
+  bool has_execution_annotation = true;
+  for (auto const node : graph_nodes)
+  {
+    if (!luci::has_execution_plan(loco::must_cast<const luci::CircleNode *>(node)))
+    {
+      has_execution_annotation = false;
+      break;
+    }
+  }
+
+  if (has_execution_annotation)
+  {
+    // ordered_nodes[i] is the node placed at position i of the execution plan.
+    std::vector<const luci::CircleNode *> ordered_nodes(graph_nodes.size());
+    for (auto const node : graph_nodes)
+    {
+      const auto *circle_node = loco::must_cast<const luci::CircleNode *>(node);
+      ordered_nodes.at(luci::get_execution_plan(circle_node).order_in_plan()) = circle_node;
+    }
+
+    for (const auto *node : ordered_nodes)
+    {
+      // An empty slot means two nodes share a position; fail instead of dereferencing null.
+      if (node == nullptr)
+        throw std::runtime_error("Invalid execution plan: duplicated order_in_plan.");
+      add_kernel_for(node);
+    }
+  }
+  else
+  {
+    // If it is impossible to build the execution order plan,
+    // then we use the default postorder_traversal approach.
+    for (const loco::Node *loco_node : loco::postorder_traversal(loco::output_nodes(graph)))
+      add_kernel_for(loco::must_cast<const luci::CircleNode *>(loco_node));
+  }
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/GraphLoader.h b/compiler/luci-micro/luci-interpreter/src/loader/GraphLoader.h
new file mode 100644
index 000000000..fe066ecf8
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/GraphLoader.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_LOADER_GRAPHLOADER_H
+#define LUCI_INTERPRETER_LOADER_GRAPHLOADER_H
+
+#include "core/RuntimeGraph.h"
+#include "loader/RuntimeToIR.h"
+#include "luci_interpreter/MemoryManager.h"
+
+#include <loco/IR/Graph.h>
+
+#include <unordered_map>
+
+namespace luci_interpreter
+{
+
+class GraphLoader
+{
+public:
+  GraphLoader(const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
+              const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
+              std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor,
+              IMemoryManager *memory_manager);
+  // Loading steps, implemented in GraphLoader.cpp.
+  void loadTensors(); // creates a runtime Tensor for each tensor-producing node of the graph
+  void initInputOutputTensors() const; // registers the graph input / output tensors
+  void loadOperators(); // builds kernels for the executable nodes of the graph
+
+private:
+  const loco::Graph *_graph;
+  RuntimeGraph *_runtime_graph;
+  RuntimeToIR &_runtime_to_ir;
+  IMemoryManager *_memory_manager;
+  // NOTE members are initialized in declaration order, whatever order the constructor lists them.
+  const std::unordered_map<const loco::Graph *, RuntimeGraph *> &_graph_to_runtime_graph;
+  std::unordered_map<const loco::Node *, Tensor *> &_node_to_tensor;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_LOADER_GRAPHLOADER_H
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilder.cpp b/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilder.cpp
new file mode 100644
index 000000000..8483a9a3d
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilder.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "loader/KernelBuilder.h"
+#include "loader/nodes/Builders.h"
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+#define CIRCLE_NODE(OPCODE, CLASS) CLASS,
+#define CIRCLE_VNODE(OPCODE, CLASS) CLASS,
+
+// This enum is auxiliary: its enumerators are generated from CircleNodes.lst.
+// It is a duplicate of luci::CircleOpcode but initialized with CLASS instead of OPCODE,
+// because the list of target operators is in the format of CLASS names.
+enum class BuilderId
+{
+#include <luci/IR/CircleNodes.lst>
+  Size // must stay last: casts to the count of values in BuilderId enum
+};
+
+#undef CIRCLE_VNODE
+#undef CIRCLE_NODE
+
+/**
+ * @brief Registry of kernel builders
+ *
+ * Maps each luci::CircleOpcode to its kernel builder function (nullptr when none is registered).
+ */
+
+class KernelBuilderRegistry
+{
+public:
+  using KernelBuilderFunc = std::unique_ptr<Kernel>(const luci::CircleNode *,
+                                                    KernelBuilderHelper &);
+
+  KernelBuilderRegistry() : _operator_builders(size_t(BuilderId::Size), nullptr)
+  {
+#define REGISTER_KERNEL(name) \
+  register_kernel_builder(BuilderId::Circle##name, build_kernel_Circle##name);
+  // Each REGISTER_KERNEL(name) entry of the list expands to one registration call.
+#include "KernelsToBuild.lst"
+
+#undef REGISTER_KERNEL
+  }
+  // Returns the builder stored for `opcode` (may be nullptr); .at() throws if out of range.
+  KernelBuilderFunc *get_kernel_builder_func(luci::CircleOpcode opcode) const
+  {
+    return _operator_builders.at(size_t(opcode));
+  }
+
+private:
+  std::vector<KernelBuilderFunc *> _operator_builders;
+
+  void register_kernel_builder(BuilderId id, KernelBuilderFunc *func)
+  {
+    // BuilderId is a duplicate of luci::CircleOpcode, so
+    // size_t(id) is equal to size_t(corresponding operation opcode).
+    assert(size_t(id) < _operator_builders.size());
+    _operator_builders[size_t(id)] = func;
+  }
+};
+
+KernelBuilder::KernelBuilder(
+  const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
+  const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
+  : KernelBuilderHelper(graph_to_runtime_graph, node_to_tensor),
+    _builder_registry(std::make_unique<KernelBuilderRegistry>())
+{
+}
+
+// The destructor is defined in this CPP, where KernelBuilderRegistry is a complete type,
+// so that the registry internals stay hidden from the header.
+// Destroying the _builder_registry member deletes the registry.
+
+KernelBuilder::~KernelBuilder() = default;
+
+std::unique_ptr<Kernel> KernelBuilder::build(const luci::CircleNode *node)
+{
+  // Look up the builder registered for this opcode; nullptr means no builder is available.
+  const auto opcode = node->opcode();
+  if (auto *builder_func = _builder_registry->get_kernel_builder_func(opcode))
+    return builder_func(node, *this);
+
+  const std::string opcode_str = std::to_string(static_cast<uint32_t>(opcode));
+  throw std::invalid_argument("Unsupported operator: " + opcode_str + " " + node->name());
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilder.h b/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilder.h
new file mode 100644
index 000000000..b1f383394
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilder.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_LOADER_KERNELBUILDER_H
+#define LUCI_INTERPRETER_LOADER_KERNELBUILDER_H
+
+#include "loader/KernelBuilderHelper.h"
+
+#include "core/Kernel.h"
+#include "core/RuntimeGraph.h"
+
+#include <luci/IR/CircleNodeVisitor.h>
+
+#include <memory>
+#include <unordered_map>
+
+namespace luci_interpreter
+{
+
+class KernelBuilderRegistry;
+
+class KernelBuilder : public KernelBuilderHelper
+{
+public:
+  KernelBuilder(
+    const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
+    const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor);
+  // Declared here, defined in KernelBuilder.cpp where KernelBuilderRegistry is complete.
+  ~KernelBuilder();
+  // Builds the kernel for `node`; throws std::invalid_argument if no builder is registered.
+  std::unique_ptr<Kernel> build(const luci::CircleNode *node);
+
+private:
+  std::unique_ptr<KernelBuilderRegistry> _builder_registry;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_LOADER_KERNELBUILDER_H
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilder.test.cpp b/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilder.test.cpp
new file mode 100644
index 000000000..b221b6921
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilder.test.cpp
@@ -0,0 +1,1376 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "loader/GraphLoader.h"
+#include "loader/KernelBuilder.h"
+#include "luci_interpreter/SimpleMemoryManager.h"
+
+#include <kernels/Add.h>
+#include <kernels/ArgMax.h>
+#include <kernels/AveragePool2D.h>
+#include <kernels/BatchMatMul.h>
+#include <kernels/Cast.h>
+#include <kernels/Concatenation.h>
+#include <kernels/Conv2D.h>
+#include <kernels/DepthToSpace.h>
+#include <kernels/DepthwiseConv2D.h>
+#include <kernels/Div.h>
+#include <kernels/Elu.h>
+#include <kernels/Exp.h>
+#include <kernels/Floor.h>
+#include <kernels/FloorDiv.h>
+#include <kernels/Equal.h>
+#include <kernels/FullyConnected.h>
+#include <kernels/Greater.h>
+#include <kernels/GreaterEqual.h>
+#include <kernels/InstanceNorm.h>
+#include <kernels/L2Normalize.h>
+#include <kernels/L2Pool2D.h>
+#include <kernels/LeakyRelu.h>
+#include <kernels/Less.h>
+#include <kernels/LessEqual.h>
+#include <kernels/LocalResponseNormalization.h>
+#include <kernels/LogicalAnd.h>
+#include <kernels/LogicalNot.h>
+#include <kernels/LogicalOr.h>
+#include <kernels/Logistic.h>
+#include <kernels/LogSoftmax.h>
+#include <kernels/Maximum.h>
+#include <kernels/MaxPool2D.h>
+#include <kernels/Mean.h>
+#include <kernels/Minimum.h>
+#include <kernels/Mul.h>
+#include <kernels/Neg.h>
+#include <kernels/NotEqual.h>
+#include <kernels/OneHot.h>
+#include <kernels/Pad.h>
+#include <kernels/PadV2.h>
+#include <kernels/Pow.h>
+#include <kernels/PRelu.h>
+#include <kernels/Relu.h>
+#include <kernels/Relu6.h>
+#include <kernels/Reshape.h>
+#include <kernels/ResizeBilinear.h>
+#include <kernels/ResizeNearestNeighbor.h>
+#include <kernels/ReverseV2.h>
+#include <kernels/Rsqrt.h>
+#include <kernels/Slice.h>
+#include <kernels/Softmax.h>
+#include <kernels/SpaceToDepth.h>
+#include <kernels/Split.h>
+#include <kernels/SplitV.h>
+#include <kernels/Sqrt.h>
+#include <kernels/SquaredDifference.h>
+#include <kernels/Squeeze.h>
+#include <kernels/StridedSlice.h>
+#include <kernels/Sub.h>
+#include <kernels/Tanh.h>
+#include <kernels/Transpose.h>
+#include <kernels/TransposeConv.h>
+#include <kernels/Unpack.h>
+
+#include <gmock/gmock.h>
+
+namespace luci_interpreter
+{
+namespace
+{
+
+using namespace testing;
+
+class KernelBuilderTest : public Test
+{
+protected:
+  luci::CircleInput *createInputNode() { return createNode<luci::CircleInput>(); }
+  void SetUp() override { _memory_manager = std::make_unique<SimpleMemoryManager>(); }
+  std::unique_ptr<IMemoryManager> _memory_manager;
+  // Creates a NodeT in the test graph, forwarding `args` to the node constructor.
+  template <typename NodeT, typename... Args> NodeT *createNode(Args &&... args)
+  {
+    auto *node = _graph.nodes()->create<NodeT>(std::forward<Args>(args)...);
+    // The actual type does not matter for the purpose of the tests.
+    // NOTE The type is meaningless for nodes with multiple outputs (corresponding *Out nodes carry
+    //  actual output types).
+    node->dtype(loco::DataType::FLOAT32);
+    return node;
+  }
+  // Creates a NodeOutT whose input is `node` and whose output index is `index`.
+  template <typename NodeOutT> NodeOutT *createNodeOut(loco::Node *node, int index)
+  {
+    auto *node_out = createNode<NodeOutT>();
+    node_out->input(node);
+    node_out->index(index);
+    return node_out;
+  }
+  // Loads the tensors of the test graph and builds the kernel for `op`; null if not a KernelT.
+  template <typename KernelT> std::unique_ptr<KernelT> buildKernel(const luci::CircleNode *op)
+  {
+    std::unordered_map<const loco::Graph *, RuntimeGraph *> graph_to_runtime_graph;
+    RuntimeGraph runtime_graph(nullptr, _memory_manager.get());
+    graph_to_runtime_graph[&_graph] = &runtime_graph;
+    RuntimeToIR runtime_to_ir;
+    GraphLoader graph_loader(&_graph, &runtime_graph, runtime_to_ir, graph_to_runtime_graph,
+                             _node_to_tensor, _memory_manager.get());
+    graph_loader.loadTensors();
+    KernelBuilder kernel_builder(graph_to_runtime_graph, _node_to_tensor);
+    auto kernel = kernel_builder.build(op);
+    // Give up ownership only if the cast succeeds; otherwise `kernel` destroys the object.
+    auto *typed_kernel = dynamic_cast<KernelT *>(kernel.get());
+    if (typed_kernel != nullptr)
+      (void)kernel.release();
+    return std::unique_ptr<KernelT>(typed_kernel);
+  }
+  // Expects `tensor` to be the very tensor that was loaded for `node`.
+  void checkTensor(const Tensor *tensor, const loco::Node *node)
+  {
+    EXPECT_THAT(tensor, Eq(_node_to_tensor.at(node)));
+  }
+
+private:
+  loco::Graph _graph;
+  std::unordered_map<const loco::Node *, Tensor *> _node_to_tensor;
+};
+
+TEST_F(KernelBuilderTest, Add)
+{
+  auto *lhs = createInputNode();
+  auto *rhs = createInputNode();
+
+  auto *add_node = createNode<luci::CircleAdd>();
+  add_node->x(lhs);
+  add_node->y(rhs);
+  // Set the attribute that the parameter check below compares against.
+  add_node->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+  auto add_kernel = buildKernel<kernels::Add>(add_node);
+  ASSERT_THAT(add_kernel, NotNull());
+  // The kernel must use the tensors loaded for the IR nodes and mirror the node attributes.
+  checkTensor(add_kernel->input1(), lhs);
+  checkTensor(add_kernel->input2(), rhs);
+  checkTensor(add_kernel->output(), add_node);
+  EXPECT_THAT(add_kernel->params().activation, Eq(add_node->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, ArgMax)
+{
+  auto *data = createInputNode();
+  auto *axis_node = createInputNode();
+
+  auto *argmax_node = createNode<luci::CircleArgMax>();
+  argmax_node->input(data);
+  argmax_node->dimension(axis_node);
+  // Set the attribute that the parameter check below compares against.
+  argmax_node->output_type(loco::DataType::FLOAT32);
+
+  auto argmax_kernel = buildKernel<kernels::ArgMax>(argmax_node);
+  ASSERT_THAT(argmax_kernel, NotNull());
+  // Tensors of the kernel have to be the ones loaded for the IR nodes.
+  checkTensor(argmax_kernel->input(), data);
+  checkTensor(argmax_kernel->axis(), axis_node);
+  checkTensor(argmax_kernel->output(), argmax_node);
+  EXPECT_THAT(argmax_kernel->params().output_type, Eq(argmax_node->output_type()));
+}
+
+TEST_F(KernelBuilderTest, AveragePool2D)
+{
+  auto *data = createInputNode();
+
+  auto *pool_node = createNode<luci::CircleAveragePool2D>();
+  pool_node->value(data);
+  // Every attribute gets its own distinct value so that mixed-up parameters are detected.
+  pool_node->padding(luci::Padding::SAME);
+  pool_node->stride()->h(17);
+  pool_node->stride()->w(19);
+  pool_node->filter()->h(11);
+  pool_node->filter()->w(13);
+  pool_node->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+  auto pool_kernel = buildKernel<kernels::AveragePool2D>(pool_node);
+  ASSERT_THAT(pool_kernel, NotNull());
+  // Tensors first, then each kernel parameter against the corresponding node attribute.
+  checkTensor(pool_kernel->input(), data);
+  checkTensor(pool_kernel->output(), pool_node);
+  EXPECT_THAT(pool_kernel->params().padding, Eq(pool_node->padding()));
+  EXPECT_THAT(pool_kernel->params().filter_height, Eq(pool_node->filter()->h()));
+  EXPECT_THAT(pool_kernel->params().filter_width, Eq(pool_node->filter()->w()));
+  EXPECT_THAT(pool_kernel->params().stride_height, Eq(pool_node->stride()->h()));
+  EXPECT_THAT(pool_kernel->params().stride_width, Eq(pool_node->stride()->w()));
+  EXPECT_THAT(pool_kernel->params().activation, Eq(pool_node->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, BatchMatMul)
+{
+  auto *left = createInputNode();
+  auto *right = createInputNode();
+  // Wire both operands and set the two adjoint flags checked below.
+  auto *matmul_node = createNode<luci::CircleBatchMatMul>();
+  matmul_node->x(left);
+  matmul_node->y(right);
+  matmul_node->adj_x(false);
+  matmul_node->adj_y(false);
+
+  auto matmul_kernel = buildKernel<kernels::BatchMatMul>(matmul_node);
+  ASSERT_THAT(matmul_kernel, NotNull());
+  // Tensors of the kernel have to be the ones loaded for the IR nodes.
+  checkTensor(matmul_kernel->x(), left);
+  checkTensor(matmul_kernel->y(), right);
+  checkTensor(matmul_kernel->output(), matmul_node);
+  EXPECT_THAT(matmul_kernel->params().adj_x, Eq(matmul_node->adj_x()));
+  EXPECT_THAT(matmul_kernel->params().adj_y, Eq(matmul_node->adj_y()));
+}
+
+TEST_F(KernelBuilderTest, Cast)
+{
+  auto *source = createInputNode();
+
+  auto *cast_node = createNode<luci::CircleCast>();
+  cast_node->x(source);
+
+  auto cast_kernel = buildKernel<kernels::Cast>(cast_node);
+  ASSERT_THAT(cast_kernel, NotNull());
+  // Cast has no parameters to compare; only the tensor wiring is checked.
+  checkTensor(cast_kernel->input(), source);
+  checkTensor(cast_kernel->output(), cast_node);
+}
+
+TEST_F(KernelBuilderTest, Concatenation)
+{
+  auto *first = createInputNode();
+  auto *second = createInputNode();
+  // The constructor argument is the number of values to concatenate.
+  auto *concat_node = createNode<luci::CircleConcatenation>(2);
+  concat_node->values(0, first);
+  concat_node->values(1, second);
+  concat_node->axis(11);
+
+  auto concat_kernel = buildKernel<kernels::Concatenation>(concat_node);
+  ASSERT_THAT(concat_kernel, NotNull());
+  // Inputs are checked position by position, then the parameters.
+  checkTensor(concat_kernel->input(0), first);
+  checkTensor(concat_kernel->input(1), second);
+  checkTensor(concat_kernel->output(), concat_node);
+  EXPECT_THAT(concat_kernel->params().axis, Eq(concat_node->axis()));
+  EXPECT_THAT(concat_kernel->params().activation, Eq(concat_node->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Conv2D)
+{
+  auto *data = createInputNode();
+  auto *kernel_weights = createInputNode();
+  auto *bias_node = createInputNode();
+
+  auto *conv_node = createNode<luci::CircleConv2D>();
+  conv_node->input(data);
+  conv_node->filter(kernel_weights);
+  conv_node->bias(bias_node);
+  // Every attribute gets its own distinct value so that mixed-up parameters are detected.
+  conv_node->padding(luci::Padding::SAME);
+  conv_node->dilation()->h(17);
+  conv_node->dilation()->w(19);
+  conv_node->stride()->h(11);
+  conv_node->stride()->w(13);
+  conv_node->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+  auto conv_kernel = buildKernel<kernels::Conv2D>(conv_node);
+  ASSERT_THAT(conv_kernel, NotNull());
+  // Tensors first, then each kernel parameter against the corresponding node attribute.
+  checkTensor(conv_kernel->input(), data);
+  checkTensor(conv_kernel->filter(), kernel_weights);
+  checkTensor(conv_kernel->bias(), bias_node);
+  checkTensor(conv_kernel->output(), conv_node);
+  EXPECT_THAT(conv_kernel->params().padding, Eq(conv_node->padding()));
+  EXPECT_THAT(conv_kernel->params().stride_height, Eq(conv_node->stride()->h()));
+  EXPECT_THAT(conv_kernel->params().stride_width, Eq(conv_node->stride()->w()));
+  EXPECT_THAT(conv_kernel->params().dilation_height_factor, Eq(conv_node->dilation()->h()));
+  EXPECT_THAT(conv_kernel->params().dilation_width_factor, Eq(conv_node->dilation()->w()));
+  EXPECT_THAT(conv_kernel->params().activation, Eq(conv_node->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, DepthToSpace)
+{
+  auto *data = createInputNode();
+
+  auto *d2s_node = createNode<luci::CircleDepthToSpace>();
+  d2s_node->input(data);
+  // Set the attribute that the parameter check below compares against.
+  d2s_node->block_size(11);
+
+  auto d2s_kernel = buildKernel<kernels::DepthToSpace>(d2s_node);
+  ASSERT_THAT(d2s_kernel, NotNull());
+  // Tensors of the kernel have to be the ones loaded for the IR nodes.
+  checkTensor(d2s_kernel->input(), data);
+  checkTensor(d2s_kernel->output(), d2s_node);
+  EXPECT_THAT(d2s_kernel->params().block_size, Eq(d2s_node->block_size()));
+}
+
+TEST_F(KernelBuilderTest, DepthwiseConv2D)
+{
+  auto *data = createInputNode();
+  auto *kernel_weights = createInputNode();
+  auto *bias_node = createInputNode();
+
+  auto *dw_node = createNode<luci::CircleDepthwiseConv2D>();
+  dw_node->input(data);
+  dw_node->filter(kernel_weights);
+  dw_node->bias(bias_node);
+  // Every attribute gets its own distinct value so that mixed-up parameters are detected.
+  dw_node->padding(luci::Padding::SAME);
+  dw_node->depthMultiplier(11);
+  dw_node->dilation()->h(19);
+  dw_node->dilation()->w(23);
+  dw_node->stride()->h(13);
+  dw_node->stride()->w(17);
+  dw_node->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+  auto dw_kernel = buildKernel<kernels::DepthwiseConv2D>(dw_node);
+  ASSERT_THAT(dw_kernel, NotNull());
+  // Tensors first, then each kernel parameter against the corresponding node attribute.
+  checkTensor(dw_kernel->input(), data);
+  checkTensor(dw_kernel->filter(), kernel_weights);
+  checkTensor(dw_kernel->bias(), bias_node);
+  checkTensor(dw_kernel->output(), dw_node);
+  EXPECT_THAT(dw_kernel->params().padding, Eq(dw_node->padding()));
+  EXPECT_THAT(dw_kernel->params().depth_multiplier, Eq(dw_node->depthMultiplier()));
+  EXPECT_THAT(dw_kernel->params().stride_height, Eq(dw_node->stride()->h()));
+  EXPECT_THAT(dw_kernel->params().stride_width, Eq(dw_node->stride()->w()));
+  EXPECT_THAT(dw_kernel->params().dilation_height_factor, Eq(dw_node->dilation()->h()));
+  EXPECT_THAT(dw_kernel->params().dilation_width_factor, Eq(dw_node->dilation()->w()));
+  EXPECT_THAT(dw_kernel->params().activation, Eq(dw_node->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Div)
+{
+  auto *dividend = createInputNode();
+  auto *divisor = createInputNode();
+
+  auto *div_node = createNode<luci::CircleDiv>();
+  div_node->x(dividend);
+  div_node->y(divisor);
+  // Set the attribute that the parameter check below compares against.
+  div_node->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+  auto div_kernel = buildKernel<kernels::Div>(div_node);
+  ASSERT_THAT(div_kernel, NotNull());
+  // Tensors of the kernel have to be the ones loaded for the IR nodes.
+  checkTensor(div_kernel->input1(), dividend);
+  checkTensor(div_kernel->input2(), divisor);
+  checkTensor(div_kernel->output(), div_node);
+  EXPECT_THAT(div_kernel->params().activation, Eq(div_node->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Elu)
+{
+  auto *features = createInputNode();
+
+  auto *elu_node = createNode<luci::CircleElu>();
+  elu_node->features(features);
+
+  auto elu_kernel = buildKernel<kernels::Elu>(elu_node);
+  ASSERT_THAT(elu_kernel, NotNull());
+  // Elu has no parameters to compare; only the tensor wiring is checked.
+  checkTensor(elu_kernel->input(), features);
+  checkTensor(elu_kernel->output(), elu_node);
+}
+
+TEST_F(KernelBuilderTest, Exp)
+{
+  auto *operand = createInputNode();
+
+  auto *exp_node = createNode<luci::CircleExp>();
+  exp_node->x(operand);
+
+  auto exp_kernel = buildKernel<kernels::Exp>(exp_node);
+  ASSERT_THAT(exp_kernel, NotNull());
+  // Exp has no parameters to compare; only the tensor wiring is checked.
+  checkTensor(exp_kernel->input(), operand);
+  checkTensor(exp_kernel->output(), exp_node);
+}
+
+TEST_F(KernelBuilderTest, Floor) // Builds kernels::Floor from a CircleFloor; checks input and output.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleFloor>();
+  op->x(input);
+
+  auto kernel = buildKernel<kernels::Floor>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, FloorDiv) // Builds kernels::FloorDiv from a CircleFloorDiv; checks x, y and output.
+{
+  auto *x = createInputNode();
+  auto *y = createInputNode();
+
+  auto *op = createNode<luci::CircleFloorDiv>();
+  op->x(x);
+  op->y(y);
+
+  auto kernel = buildKernel<kernels::FloorDiv>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->x(), x);
+  checkTensor(kernel->y(), y);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Equal) // Builds kernels::Equal from a CircleEqual; checks x, y and output.
+{
+  auto *x_input = createInputNode();
+  auto *y_input = createInputNode();
+
+  auto *op = createNode<luci::CircleEqual>();
+  op->x(x_input);
+  op->y(y_input);
+
+  auto kernel = buildKernel<kernels::Equal>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->x(), x_input);
+  checkTensor(kernel->y(), y_input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, FullyConnected) // Builds kernels::FullyConnected; checks input, weights, bias, output, activation.
+{
+  auto *input = createInputNode();
+  auto *weights = createInputNode();
+  auto *bias = createInputNode();
+
+  auto *op = createNode<luci::CircleFullyConnected>();
+  op->input(input);
+  op->weights(weights);
+  op->bias(bias);
+
+  op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+  auto kernel = buildKernel<kernels::FullyConnected>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->weights(), weights);
+  checkTensor(kernel->bias(), bias);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Greater) // Builds kernels::Greater from a CircleGreater; checks x, y and output.
+{
+  auto *x_input = createInputNode();
+  auto *y_input = createInputNode();
+
+  auto *op = createNode<luci::CircleGreater>();
+  op->x(x_input);
+  op->y(y_input);
+
+  auto kernel = buildKernel<kernels::Greater>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->x(), x_input);
+  checkTensor(kernel->y(), y_input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, GreaterEqual) // Builds kernels::GreaterEqual from a CircleGreaterEqual; checks x, y, output.
+{
+  auto *x_input = createInputNode();
+  auto *y_input = createInputNode();
+
+  auto *op = createNode<luci::CircleGreaterEqual>();
+  op->x(x_input);
+  op->y(y_input);
+
+  auto kernel = buildKernel<kernels::GreaterEqual>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->x(), x_input);
+  checkTensor(kernel->y(), y_input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, InstanceNorm) // Builds kernels::InstanceNorm; checks input, gamma, beta, output, epsilon, activation.
+{
+  auto *input = createInputNode();
+  auto *gamma = createInputNode();
+  auto *beta = createInputNode();
+
+  auto *op = createNode<luci::CircleInstanceNorm>();
+  op->input(input);
+  op->gamma(gamma);
+  op->beta(beta);
+
+  op->epsilon(1e-05);
+  op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+  auto kernel = buildKernel<kernels::InstanceNorm>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->gamma(), gamma);
+  checkTensor(kernel->beta(), beta);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().epsilon, Eq(op->epsilon())); // exact equality against the value read back from op
+  EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, L2Normalize) // Builds kernels::L2Normalize; checks input, output and activation.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleL2Normalize>();
+  op->x(input);
+
+  op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+  auto kernel = buildKernel<kernels::L2Normalize>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, L2Pool2D) // Builds kernels::L2Pool2D; checks input, output and every pooling parameter.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleL2Pool2D>();
+  op->value(input);
+
+  op->padding(luci::Padding::SAME);
+  op->filter()->h(11); // all four sizes are distinct, so a mixed-up field would not compare equal
+  op->filter()->w(13);
+  op->stride()->h(17);
+  op->stride()->w(19);
+  op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+  auto kernel = buildKernel<kernels::L2Pool2D>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+  EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h()));
+  EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w()));
+  EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+  EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+  EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, LeakyRelu) // Builds kernels::LeakyRelu; checks input, output and alpha.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleLeakyRelu>();
+  op->features(input); // the single operand is set through features()
+
+  op->alpha(11.0f);
+
+  auto kernel = buildKernel<kernels::LeakyRelu>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().alpha, Eq(op->alpha()));
+}
+
+TEST_F(KernelBuilderTest, Less) // Builds kernels::Less from a CircleLess; checks x, y and output.
+{
+  auto *x_input = createInputNode();
+  auto *y_input = createInputNode();
+
+  auto *op = createNode<luci::CircleLess>();
+  op->x(x_input);
+  op->y(y_input);
+
+  auto kernel = buildKernel<kernels::Less>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->x(), x_input);
+  checkTensor(kernel->y(), y_input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, LessEqual) // Builds kernels::LessEqual from a CircleLessEqual; checks x, y and output.
+{
+  auto *x_input = createInputNode();
+  auto *y_input = createInputNode();
+
+  auto *op = createNode<luci::CircleLessEqual>();
+  op->x(x_input);
+  op->y(y_input);
+
+  auto kernel = buildKernel<kernels::LessEqual>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->x(), x_input);
+  checkTensor(kernel->y(), y_input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, LocalResponseNormalization) // Checks input, output and the four scalar parameters.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleLocalResponseNormalization>();
+  op->input(input);
+
+  op->radius(11); // four distinct values, so a mixed-up field would not compare equal
+  op->bias(13.0f);
+  op->alpha(15.0f);
+  op->beta(17.0f);
+
+  auto kernel = buildKernel<kernels::LocalResponseNormalization>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().radius, Eq(op->radius()));
+  EXPECT_THAT(kernel->params().bias, Eq(op->bias()));
+  EXPECT_THAT(kernel->params().alpha, Eq(op->alpha()));
+  EXPECT_THAT(kernel->params().beta, Eq(op->beta()));
+}
+
+TEST_F(KernelBuilderTest, LogicalAnd) // Builds kernels::LogicalAnd; checks both operands and output.
+{
+  auto *input1 = createInputNode();
+  auto *input2 = createInputNode();
+
+  auto *op = createNode<luci::CircleLogicalAnd>();
+  op->x(input1);
+  op->y(input2);
+
+  auto kernel = buildKernel<kernels::LogicalAnd>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input1(), input1);
+  checkTensor(kernel->input2(), input2);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, LogicalNot) // Builds kernels::LogicalNot; checks input and output.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleLogicalNot>();
+  op->x(input);
+
+  auto kernel = buildKernel<kernels::LogicalNot>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, LogicalOr) // Builds kernels::LogicalOr; checks both operands and output.
+{
+  auto *input1 = createInputNode();
+  auto *input2 = createInputNode();
+
+  auto *op = createNode<luci::CircleLogicalOr>();
+  op->x(input1);
+  op->y(input2);
+
+  auto kernel = buildKernel<kernels::LogicalOr>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input1(), input1);
+  checkTensor(kernel->input2(), input2);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Logistic) // Builds kernels::Logistic from a CircleLogistic; checks input and output.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleLogistic>();
+  op->x(input);
+
+  auto kernel = buildKernel<kernels::Logistic>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, LogSoftmax) // Builds kernels::LogSoftmax; checks input and output.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleLogSoftmax>();
+  op->logits(input); // the single operand is set through logits()
+
+  auto kernel = buildKernel<kernels::LogSoftmax>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Maximum) // Builds kernels::Maximum; checks both operands and output.
+{
+  auto *input1 = createInputNode();
+  auto *input2 = createInputNode();
+
+  auto *op = createNode<luci::CircleMaximum>();
+  op->x(input1);
+  op->y(input2);
+
+  auto kernel = buildKernel<kernels::Maximum>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input1(), input1);
+  checkTensor(kernel->input2(), input2);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, MaxPool2D) // Builds kernels::MaxPool2D; checks input, output and every pooling parameter.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleMaxPool2D>();
+  op->value(input);
+
+  op->padding(luci::Padding::SAME);
+  op->filter()->h(11); // all four sizes are distinct, so a mixed-up field would not compare equal
+  op->filter()->w(13);
+  op->stride()->h(17);
+  op->stride()->w(19);
+  op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+  auto kernel = buildKernel<kernels::MaxPool2D>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+  EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h()));
+  EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w()));
+  EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+  EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+  EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Mean) // Builds kernels::Mean; checks input, axes, output and keep_dims.
+{
+  auto *input = createInputNode();
+  auto *axes = createInputNode();
+
+  auto *op = createNode<luci::CircleMean>();
+  op->input(input);
+  op->reduction_indices(axes); // read back below through kernel->axes()
+
+  op->keep_dims(true);
+
+  auto kernel = buildKernel<kernels::Mean>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->axes(), axes);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().keep_dims, Eq(op->keep_dims()));
+}
+
+TEST_F(KernelBuilderTest, Minimum) // Builds kernels::Minimum; checks both operands and output.
+{
+  auto *input1 = createInputNode();
+  auto *input2 = createInputNode();
+
+  auto *op = createNode<luci::CircleMinimum>();
+  op->x(input1);
+  op->y(input2);
+
+  auto kernel = buildKernel<kernels::Minimum>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input1(), input1);
+  checkTensor(kernel->input2(), input2);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Mul) // Builds kernels::Mul from a CircleMul; checks both operands, output, activation.
+{
+  auto *input1 = createInputNode();
+  auto *input2 = createInputNode();
+
+  auto *op = createNode<luci::CircleMul>();
+  op->x(input1);
+  op->y(input2);
+
+  op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+  auto kernel = buildKernel<kernels::Mul>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input1(), input1);
+  checkTensor(kernel->input2(), input2);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Neg) // Builds kernels::Neg from a CircleNeg; checks input and output.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleNeg>();
+  op->x(input);
+
+  auto kernel = buildKernel<kernels::Neg>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, NotEqual) // Builds kernels::NotEqual from a CircleNotEqual; checks x, y and output.
+{
+  auto *x_input = createInputNode();
+  auto *y_input = createInputNode();
+
+  auto *op = createNode<luci::CircleNotEqual>();
+  op->x(x_input);
+  op->y(y_input);
+
+  auto kernel = buildKernel<kernels::NotEqual>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->x(), x_input);
+  checkTensor(kernel->y(), y_input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, OneHot) // Builds kernels::OneHot; checks the four input tensors and the axis parameter.
+{
+  auto *indices = createInputNode();
+  auto *depth = createInputNode();
+  auto *on_value = createInputNode();
+  auto *off_value = createInputNode();
+  auto axis = 1;
+
+  auto *op = createNode<luci::CircleOneHot>();
+  op->indices(indices);
+  op->depth(depth);
+  op->on_value(on_value);
+  op->off_value(off_value);
+  op->axis(axis);
+
+  auto kernel = buildKernel<kernels::OneHot>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->indices(), indices);
+  checkTensor(kernel->depth(), depth);
+  checkTensor(kernel->on_value(), on_value);
+  checkTensor(kernel->off_value(), off_value); // NOTE(review): the output tensor is not checked here, unlike the sibling tests
+  EXPECT_THAT(kernel->params().axis, Eq(op->axis()));
+}
+
+TEST_F(KernelBuilderTest, Pad) // Builds kernels::Pad from a CirclePad; checks input, paddings and output.
+{
+  auto *input = createInputNode();
+  auto *paddings = createInputNode();
+
+  auto *op = createNode<luci::CirclePad>();
+  op->input(input);
+  op->paddings(paddings);
+
+  auto kernel = buildKernel<kernels::Pad>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->paddings(), paddings);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, PadV2) // Builds kernels::PadV2; checks input, paddings, constant_values and output.
+{
+  auto *input = createInputNode();
+  auto *paddings = createInputNode();
+  auto *constant_values = createInputNode();
+
+  auto *op = createNode<luci::CirclePadV2>();
+  op->input(input);
+  op->paddings(paddings);
+  op->constant_values(constant_values);
+
+  auto kernel = buildKernel<kernels::PadV2>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->paddings(), paddings);
+  checkTensor(kernel->constant_values(), constant_values);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Pow) // Builds kernels::Pow from a CirclePow; checks both operands and output.
+{
+  auto *input1 = createInputNode();
+  auto *input2 = createInputNode();
+
+  auto *op = createNode<luci::CirclePow>();
+  op->x(input1);
+  op->y(input2);
+
+  auto kernel = buildKernel<kernels::Pow>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input1(), input1);
+  checkTensor(kernel->input2(), input2);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, PRelu) // Builds kernels::PRelu from a CirclePRelu; checks input, alpha and output.
+{
+  auto *input = createInputNode();
+  auto *alpha = createInputNode(); // here alpha is a node input, not a scalar attribute
+
+  auto *op = createNode<luci::CirclePRelu>();
+  op->input(input);
+  op->alpha(alpha);
+
+  auto kernel = buildKernel<kernels::PRelu>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->alpha(), alpha);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Relu) // Builds kernels::Relu from a CircleRelu; checks input and output.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleRelu>();
+  op->features(input); // the single operand is set through features()
+
+  auto kernel = buildKernel<kernels::Relu>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Relu6) // Builds kernels::Relu6 from a CircleRelu6; checks input and output.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleRelu6>();
+  op->features(input); // the single operand is set through features()
+
+  auto kernel = buildKernel<kernels::Relu6>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Reshape) // Builds kernels::Reshape from a CircleReshape; checks input, shape and output.
+{
+  auto *input = createInputNode();
+  auto *shape = createInputNode();
+
+  auto *op = createNode<luci::CircleReshape>();
+  op->tensor(input); // read back below through kernel->input()
+  op->shape(shape);
+
+  auto kernel = buildKernel<kernels::Reshape>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->shape(), shape);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, ResizeBilinear) // Builds kernels::ResizeBilinear; checks input, size, output and both flags.
+{
+  auto *input = createInputNode();
+  auto *size = createInputNode();
+
+  auto *op = createNode<luci::CircleResizeBilinear>();
+  op->input(input);
+  op->size(size);
+  op->align_corners(true);
+  op->half_pixel_centers(true);
+
+  auto kernel = buildKernel<kernels::ResizeBilinear>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->size(), size);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().align_corners, Eq(op->align_corners()));
+  EXPECT_THAT(kernel->params().half_pixel_centers, Eq(op->half_pixel_centers()));
+}
+
+TEST_F(KernelBuilderTest, ResizeNearestNeighbor) // Checks input, size, output and align_corners.
+{
+  auto *input = createInputNode();
+  auto *size = createInputNode();
+
+  auto *op = createNode<luci::CircleResizeNearestNeighbor>();
+  op->input(input);
+  op->size(size);
+  op->align_corners(true);
+
+  auto kernel = buildKernel<kernels::ResizeNearestNeighbor>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->size(), size);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().align_corners, Eq(op->align_corners()));
+  // TODO half_pixel_centers is not yet implemented on CircleResizeNearestNeighbor;
+  // extend this test with the matching check once it has been added.
+}
+
+TEST_F(KernelBuilderTest, ReverseV2) // Builds kernels::ReverseV2 from a CircleReverseV2; checks input, axes, output.
+{
+  auto *input = createInputNode();
+  auto *axes = createInputNode();
+
+  auto *op = createNode<luci::CircleReverseV2>();
+  op->tensor(input); // read back below through kernel->input()
+  op->axis(axes); // read back below through kernel->axes()
+
+  auto kernel = buildKernel<kernels::ReverseV2>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->axes(), axes);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Rsqrt) // Builds kernels::Rsqrt from a CircleRsqrt; checks input and output.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleRsqrt>();
+  op->x(input);
+
+  auto kernel = buildKernel<kernels::Rsqrt>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Slice) // Builds kernels::Slice from a CircleSlice; checks input, begin, size and output.
+{
+  auto *input = createInputNode();
+  auto *begin = createInputNode();
+  auto *size = createInputNode();
+
+  auto *op = createNode<luci::CircleSlice>();
+  op->input(input);
+  op->begin(begin);
+  op->size(size);
+
+  auto kernel = buildKernel<kernels::Slice>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->begin(), begin);
+  checkTensor(kernel->size(), size);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Softmax) // Builds kernels::Softmax from a CircleSoftmax; checks input, output and beta.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleSoftmax>();
+  op->logits(input); // the single operand is set through logits()
+
+  op->beta(11.0f);
+
+  auto kernel = buildKernel<kernels::Softmax>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().beta, Eq(op->beta()));
+}
+
+TEST_F(KernelBuilderTest, SpaceToDepth) // Builds kernels::SpaceToDepth; checks input, output and block_size.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleSpaceToDepth>();
+  op->input(input);
+
+  op->block_size(11);
+
+  auto kernel = buildKernel<kernels::SpaceToDepth>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().block_size, Eq(op->block_size())); // explicit Eq matcher, as in the sibling tests
+}
+
+TEST_F(KernelBuilderTest, Split) // Builds kernels::Split; checks axis, input and both indexed outputs.
+{
+  auto *axis = createInputNode();
+  auto *input = createInputNode();
+  auto *op = createNode<luci::CircleSplit>();
+  auto *output1 = createNodeOut<luci::CircleSplitOut>(op, 0); // output node created for op with index 0
+  auto *output2 = createNodeOut<luci::CircleSplitOut>(op, 1); // output node created for op with index 1
+
+  op->split_dim(axis); // read back below through kernel->axis()
+  op->input(input);
+
+  op->num_split(2);
+
+  auto kernel = buildKernel<kernels::Split>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->axis(), axis);
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(0), output1);
+  checkTensor(kernel->output(1), output2);
+}
+
+TEST_F(KernelBuilderTest, SplitV) // Builds kernels::SplitV; checks input, size_splits, axis and both indexed outputs.
+{
+  auto *input = createInputNode();
+  auto *size_splits = createInputNode();
+  auto *axis = createInputNode();
+  auto *op = createNode<luci::CircleSplitV>();
+  auto *output0 = createNodeOut<luci::CircleSplitVOut>(op, 0); // output node created for op with index 0
+  auto *output1 = createNodeOut<luci::CircleSplitVOut>(op, 1); // output node created for op with index 1
+
+  op->input(input);
+  op->size_splits(size_splits);
+  op->split_dim(axis); // read back below through kernel->axis()
+
+  op->num_split(2);
+
+  auto kernel = buildKernel<kernels::SplitV>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->size_splits(), size_splits);
+  checkTensor(kernel->axis(), axis);
+  checkTensor(kernel->output(0), output0);
+  checkTensor(kernel->output(1), output1);
+}
+
+TEST_F(KernelBuilderTest, Sqrt) // Builds kernels::Sqrt from a CircleSqrt; checks input and output.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleSqrt>();
+  op->x(input);
+
+  auto kernel = buildKernel<kernels::Sqrt>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, SquaredDifference) // Builds kernels::SquaredDifference; checks both operands and output.
+{
+  auto *input1 = createInputNode();
+  auto *input2 = createInputNode();
+
+  auto *op = createNode<luci::CircleSquaredDifference>();
+  op->x(input1);
+  op->y(input2);
+
+  auto kernel = buildKernel<kernels::SquaredDifference>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input1(), input1);
+  checkTensor(kernel->input2(), input2);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Squeeze) // Builds kernels::Squeeze from a CircleSqueeze; checks input, output, squeeze_dims.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleSqueeze>();
+  op->input(input);
+
+  op->squeeze_dims({11, 13});
+
+  auto kernel = buildKernel<kernels::Squeeze>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().squeeze_dims, ElementsAreArray(op->squeeze_dims())); // element-wise, order-sensitive match
+}
+
+TEST_F(KernelBuilderTest, StridedSlice) // Builds kernels::StridedSlice; checks four inputs, output and five masks.
+{
+  auto *input = createInputNode();
+  auto *begin = createInputNode();
+  auto *end = createInputNode();
+  auto *strides = createInputNode();
+
+  auto *op = createNode<luci::CircleStridedSlice>();
+  op->input(input);
+  op->begin(begin);
+  op->end(end);
+  op->strides(strides);
+
+  op->begin_mask(11); // five distinct mask values, so a mixed-up field would not compare equal
+  op->ellipsis_mask(13);
+  op->end_mask(17);
+  op->new_axis_mask(19);
+  op->shrink_axis_mask(23);
+
+  auto kernel = buildKernel<kernels::StridedSlice>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->begin(), begin);
+  checkTensor(kernel->end(), end);
+  checkTensor(kernel->strides(), strides);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().begin_mask, Eq(op->begin_mask()));
+  EXPECT_THAT(kernel->params().ellipsis_mask, Eq(op->ellipsis_mask()));
+  EXPECT_THAT(kernel->params().end_mask, Eq(op->end_mask()));
+  EXPECT_THAT(kernel->params().new_axis_mask, Eq(op->new_axis_mask()));
+  EXPECT_THAT(kernel->params().shrink_axis_mask, Eq(op->shrink_axis_mask()));
+}
+
+TEST_F(KernelBuilderTest, Sub) // Builds kernels::Sub from a CircleSub; checks both operands, output, activation.
+{
+  auto *input1 = createInputNode();
+  auto *input2 = createInputNode();
+
+  auto *op = createNode<luci::CircleSub>();
+  op->x(input1);
+  op->y(input2);
+
+  op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+  auto kernel = buildKernel<kernels::Sub>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input1(), input1);
+  checkTensor(kernel->input2(), input2);
+  checkTensor(kernel->output(), op);
+  EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Tanh) // Builds kernels::Tanh from a CircleTanh; checks input and output.
+{
+  auto *input = createInputNode();
+
+  auto *op = createNode<luci::CircleTanh>();
+  op->x(input);
+
+  auto kernel = buildKernel<kernels::Tanh>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Transpose) // Builds kernels::Transpose from a CircleTranspose; checks input, perm, output.
+{
+  auto *input = createInputNode();
+  auto *perm = createInputNode();
+
+  auto *op = createNode<luci::CircleTranspose>();
+  op->a(input); // read back below through kernel->input()
+  op->perm(perm);
+
+  auto kernel = buildKernel<kernels::Transpose>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->perm(), perm);
+  checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, TransposeConv) // Builds kernels::TransposeConv; checks four inputs, output, padding, strides.
+{
+  auto *output_shape = createInputNode();
+  auto *filter = createInputNode();
+  auto *input = createInputNode();
+  auto *bias = createInputNode();
+
+  auto *op = createNode<luci::CircleTransposeConv>();
+  op->inputSizes(output_shape); // read back below through kernel->output_shape()
+  op->filter(filter);
+  op->outBackprop(input); // read back below through kernel->input()
+  op->bias(bias);
+
+  op->padding(luci::Padding::SAME);
+  op->stride()->h(11);
+  op->stride()->w(13);
+
+  auto kernel = buildKernel<kernels::TransposeConv>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->output_shape(), output_shape);
+  checkTensor(kernel->filter(), filter);
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(), op);
+  checkTensor(kernel->bias(), bias);
+  EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+  EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+  EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+}
+
+TEST_F(KernelBuilderTest, Unpack) // Builds kernels::Unpack from a CircleUnpack; checks input, both outputs, axis.
+{
+  auto *input = createInputNode();
+  auto *op = createNode<luci::CircleUnpack>();
+  auto *output1 = createNodeOut<luci::CircleUnpackOut>(op, 0); // output node created for op with index 0
+  auto *output2 = createNodeOut<luci::CircleUnpackOut>(op, 1); // output node created for op with index 1
+
+  op->value(input);
+
+  op->num(2);
+  op->axis(11);
+
+  auto kernel = buildKernel<kernels::Unpack>(op);
+  ASSERT_THAT(kernel, NotNull()); // fatal check: kernel is dereferenced below
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->output(0), output1);
+  checkTensor(kernel->output(1), output2);
+  EXPECT_THAT(kernel->params().axis, Eq(op->axis()));
+}
+
+TEST_F(KernelBuilderTest, NonExisting1_NEG) // Negative test: building a kernel for a CircleConst must throw.
+{
+  auto *op = createNode<luci::CircleConst>();
+  ASSERT_ANY_THROW(buildKernel<Kernel>(op)); // any exception type is accepted
+}
+
+TEST_F(KernelBuilderTest, NonExisting2_NEG) // Negative test: building a kernel for a CircleInput must throw.
+{
+  auto *op = createNode<luci::CircleInput>();
+  ASSERT_ANY_THROW(buildKernel<Kernel>(op)); // any exception type is accepted
+}
+
+TEST_F(KernelBuilderTest, NonExisting3_NEG) // Negative test: building a kernel for a CircleOutput must throw.
+{
+  auto *op = createNode<luci::CircleOutput>();
+  ASSERT_ANY_THROW(buildKernel<Kernel>(op)); // any exception type is accepted
+}
+
+} // namespace
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilderHelper.cpp b/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilderHelper.cpp
new file mode 100644
index 000000000..23c96a6db
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilderHelper.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "loader/KernelBuilderHelper.h"
+
+#include <luci/IR/Nodes/CircleOutput.h>
+
+namespace luci_interpreter
+{
+
+const Tensor *KernelBuilderHelper::getInputTensor(const loco::Node *node) const // Returns the tensor mapped to `node`.
+{
+  const Tensor *tensor = _node_to_tensor.at(node); // at() throws std::out_of_range when `node` has no entry
+  assert(tensor != nullptr); // an entry must hold a real tensor; only checked when assert is enabled
+  return tensor;
+}
+
+const Tensor *KernelBuilderHelper::getOptionalInputTensor(const loco::Node *node) const // Like getInputTensor, but may return nullptr.
+{
+  if (dynamic_cast<const luci::CircleOutputExclude *>(node)) // presumably the placeholder for an absent optional input
+  {
+    return nullptr; // callers must handle a null tensor for this case
+  }
+  return getInputTensor(node); // every other node goes through the mandatory lookup
+}
+
+Tensor *KernelBuilderHelper::getOutputTensor(const loco::Node *node) const // Returns the mutable tensor mapped to `node`.
+{
+  Tensor *tensor = _node_to_tensor.at(node); // at() throws std::out_of_range when `node` has no entry
+  assert(tensor != nullptr); // an entry must hold a real tensor; only checked when assert is enabled
+  return tensor;
+}
+
+std::vector<Tensor *> // one tensor per entry of `nodes`, in the same order
+KernelBuilderHelper::getOutputTensors(const std::vector<const loco::Node *> &nodes) const
+{
+  std::vector<Tensor *> tensors;
+  tensors.reserve(nodes.size()); // final size is known up front
+  for (const loco::Node *node : nodes)
+    tensors.push_back(getOutputTensor(node)); // same lookup and checks as the single-node overload
+  return tensors;
+}
+
+RuntimeGraph *KernelBuilderHelper::getRuntimeGraph(const loco::Graph *graph) const // Returns the runtime graph mapped to `graph`.
+{
+  RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph); // at() throws std::out_of_range when `graph` has no entry
+  assert(runtime_graph != nullptr); // an entry must hold a real graph; only checked when assert is enabled
+  return runtime_graph;
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilderHelper.h b/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilderHelper.h
new file mode 100644
index 000000000..d6fb253b1
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/KernelBuilderHelper.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_LOADER_KERNELBUILDER_HELPER_H
+#define LUCI_INTERPRETER_LOADER_KERNELBUILDER_HELPER_H
+
+#include "core/Kernel.h"
+#include "core/RuntimeGraph.h"
+
+#include <loco/IR/Graph.h>
+#include <loco/IR/Node.h>
+
+#include <vector>
+#include <unordered_map>
+
+namespace luci_interpreter
+{
+
+class KernelBuilderHelper // Lookup helper over the graph->RuntimeGraph and node->Tensor maps.
+{
+public:
+  KernelBuilderHelper(
+    const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
+    const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
+    : _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor) // bound by reference, not copied
+  {
+  }
+
+public:
+  const Tensor *getInputTensor(const loco::Node *node) const; // tensor mapped to `node`
+  const Tensor *getOptionalInputTensor(const loco::Node *node) const; // may return nullptr
+
+  Tensor *getOutputTensor(const loco::Node *node) const; // mutable tensor mapped to `node`
+  std::vector<Tensor *> getOutputTensors(const std::vector<const loco::Node *> &nodes) const; // one per node, same order
+
+  RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const; // runtime graph mapped to `graph`
+
+public:
+  const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph() const // read-only access to the map
+  {
+    return _graph_to_runtime_graph;
+  }
+
+  const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor() const // read-only access to the map
+  {
+    return _node_to_tensor;
+  }
+
+private:
+  const std::unordered_map<const loco::Graph *, RuntimeGraph *> &_graph_to_runtime_graph; // reference member: the map must outlive this helper
+  const std::unordered_map<const loco::Node *, Tensor *> &_node_to_tensor; // reference member: the map must outlive this helper
+};
+
+template <typename CircleNodeOut> // CircleNodeOut must provide index(), which is used as the sort key
+std::vector<const loco::Node *> collectOutputNodes(const loco::Node *node) // Successors of `node`, ascending by index().
+{
+  std::vector<const CircleNodeOut *> output_nodes;
+  for (const loco::Node *loco_node : loco::succs(node))
+  {
+    output_nodes.push_back(loco::must_cast<const CircleNodeOut *>(loco_node)); // every successor is cast to CircleNodeOut
+  }
+  std::sort(output_nodes.begin(), output_nodes.end(), // NOTE(review): <algorithm> is not included directly by this header
+            [](const CircleNodeOut *node1, const CircleNodeOut *node2) {
+              return node1->index() < node2->index();
+            });
+  return {output_nodes.cbegin(), output_nodes.cend()}; // range-constructs the result as const loco::Node pointers
+}
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_LOADER_KERNELBUILDER_HELPER_H
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/ModuleLoader.cpp b/compiler/luci-micro/luci-interpreter/src/loader/ModuleLoader.cpp
new file mode 100644
index 000000000..2f278b087
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/ModuleLoader.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ModuleLoader.h"
+
+#include "GraphLoader.h"
+
+namespace luci_interpreter
+{
+
+// Stores the IR module, the runtime module to populate, the shared lookup tables and the
+// memory manager. Nothing is loaded here; the work happens in load().
+//
+// The initializer list is written in the member declaration order of the class
+// (_memory_manager is declared first), which is the order in which the members are
+// actually initialized. This keeps -Wreorder quiet.
+ModuleLoader::ModuleLoader(const luci::Module *module, RuntimeModule *runtime_module,
+                           RuntimeToIR &runtime_to_ir,
+                           std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor,
+                           IMemoryManager *memory_manager)
+  : _memory_manager(memory_manager), _module(module), _runtime_module(runtime_module),
+    _runtime_to_ir(runtime_to_ir), _node_to_tensor(node_to_tensor)
+{
+}
+
+// Populates the runtime module from the IR module in two passes over the module's graphs.
+void ModuleLoader::load()
+{
+  const size_t graph_count = _module->size();
+
+  // Pass 1: create a runtime graph for every IR graph. This has to be done in advance,
+  // because the runtime graphs are needed during the loading process for control flow nodes.
+  for (size_t idx = 0; idx < graph_count; ++idx)
+  {
+    const loco::Graph *ir_graph = _module->graph(idx);
+    _graph_to_runtime_graph.emplace(ir_graph, _runtime_module->addGraph(_memory_manager));
+  }
+
+  // Pass 2: load each graph into its runtime counterpart: tensors first, then the
+  // input/output tensors, then the operators.
+  for (size_t idx = 0; idx < graph_count; ++idx)
+  {
+    const loco::Graph *ir_graph = _module->graph(idx);
+    GraphLoader graph_loader(ir_graph, _graph_to_runtime_graph.at(ir_graph), _runtime_to_ir,
+                             _graph_to_runtime_graph, _node_to_tensor, _memory_manager);
+    graph_loader.loadTensors();
+    graph_loader.initInputOutputTensors();
+    graph_loader.loadOperators();
+  }
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/ModuleLoader.h b/compiler/luci-micro/luci-interpreter/src/loader/ModuleLoader.h
new file mode 100644
index 000000000..11326a2ee
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/ModuleLoader.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_LOADER_MODULELOADER_H
+#define LUCI_INTERPRETER_LOADER_MODULELOADER_H
+
+#include "core/RuntimeModule.h"
+#include "loader/RuntimeToIR.h"
+#include "luci_interpreter/MemoryManager.h"
+
+#include <luci/IR/Module.h>
+
+#include <unordered_map>
+
+namespace luci_interpreter
+{
+
+// Loads all graphs of a luci::Module into a RuntimeModule. Construct it, then call load().
+class ModuleLoader
+{
+public:
+  // Stores the given pointers and references in the members below; no loading happens here.
+  // runtime_to_ir and node_to_tensor are held by reference, so they must outlive the loader.
+  ModuleLoader(const luci::Module *module, RuntimeModule *runtime_module,
+               RuntimeToIR &runtime_to_ir,
+               std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor,
+               IMemoryManager *memory_manager);
+
+  // Creates a runtime graph for every graph of the module, then loads each graph's tensors
+  // and operators into it.
+  void load();
+
+private:
+  // Note: non-static data members are initialized in the order they are declared here.
+  IMemoryManager *_memory_manager;
+  const luci::Module *_module;
+  RuntimeModule *_runtime_module;
+  RuntimeToIR &_runtime_to_ir;
+  std::unordered_map<const loco::Node *, Tensor *> &_node_to_tensor;
+  // IR graph -> runtime graph; filled by load() and held by value in the loader.
+  std::unordered_map<const loco::Graph *, RuntimeGraph *> _graph_to_runtime_graph;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_LOADER_MODULELOADER_H
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/RuntimeToIR.h b/compiler/luci-micro/luci-interpreter/src/loader/RuntimeToIR.h
new file mode 100644
index 000000000..9ea8b1fa2
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/RuntimeToIR.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_LOADER_RUNTIMETOIR_H
+#define LUCI_INTERPRETER_LOADER_RUNTIMETOIR_H
+
+#include "luci_interpreter/core/Tensor.h"
+
+#include <luci/IR/CircleNode.h>
+
+#include <unordered_map>
+
+namespace luci_interpreter
+{
+
+// Maps runtime entities back to IR entities. It is used to implement observing functionality.
+// Both tables are keyed by the runtime object's address and yield a luci::CircleNode.
+struct RuntimeToIR
+{
+  // Runtime Tensor -> Circle node.
+  std::unordered_map<const Tensor *, const luci::CircleNode *> tensor_to_node;
+  // Runtime Kernel -> Circle node.
+  std::unordered_map<const Kernel *, const luci::CircleNode *> kernel_to_node;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_LOADER_RUNTIMETOIR_H
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Add.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Add.cpp
new file mode 100644
index 000000000..501e84752
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Add.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Add.h"
+
+namespace luci_interpreter
+{
+
+// Builds the Add kernel for a CircleAdd node: resolves both operand tensors and the output
+// tensor through `helper` and forwards the node's fused activation function.
+std::unique_ptr<Kernel> build_kernel_CircleAdd(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *add_node = loco::must_cast<const luci::CircleAdd *>(circle_node);
+  assert(add_node->arity() == 2);
+
+  const Tensor *lhs = helper.getInputTensor(add_node->x());
+  const Tensor *rhs = helper.getInputTensor(add_node->y());
+  Tensor *result = helper.getOutputTensor(add_node);
+
+  AddParams add_params{};
+  add_params.activation = add_node->fusedActivationFunction();
+
+  return std::make_unique<kernels::Add>(lhs, rhs, result, add_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/ArgMax.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/ArgMax.cpp
new file mode 100644
index 000000000..f3ca55744
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/ArgMax.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/ArgMax.h"
+
+namespace luci_interpreter
+{
+
+// Builds the ArgMax kernel for a CircleArgMax node from its input tensor, its dimension
+// tensor and its output tensor.
+std::unique_ptr<Kernel> build_kernel_CircleArgMax(const luci::CircleNode *circle_node,
+                                                  KernelBuilderHelper &helper)
+{
+  const auto *argmax = loco::must_cast<const luci::CircleArgMax *>(circle_node);
+  assert(argmax->arity() == 2);
+
+  const Tensor *data = helper.getInputTensor(argmax->input());
+  const Tensor *dimension = helper.getInputTensor(argmax->dimension());
+  Tensor *result = helper.getOutputTensor(argmax);
+
+  // The only attribute carried over is the node's output type.
+  ArgMaxParams argmax_params{};
+  argmax_params.output_type = argmax->output_type();
+
+  return std::make_unique<kernels::ArgMax>(data, dimension, result, argmax_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/AveragePool2D.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/AveragePool2D.cpp
new file mode 100644
index 000000000..a8135706f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/AveragePool2D.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/AveragePool2D.h"
+#include <luci/Plan/CircleNodeExecutionPlan.h>
+
+namespace luci_interpreter
+{
+
+// Builds the AveragePool2D kernel for a CircleAveragePool2D node. In addition to the input
+// and output tensors, a non-observable scratchpad tensor is added to the node's runtime
+// graph and passed to the kernel.
+std::unique_ptr<Kernel> build_kernel_CircleAveragePool2D(const luci::CircleNode *circle_node,
+                                                         KernelBuilderHelper &helper)
+{
+  const auto *pool = loco::must_cast<const luci::CircleAveragePool2D *>(circle_node);
+  assert(pool->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(pool->value());
+  Tensor *output = helper.getOutputTensor(pool);
+
+  Pool2DParams pool_params{};
+  pool_params.padding = pool->padding();
+  pool_params.filter_height = pool->filter()->h();
+  pool_params.filter_width = pool->filter()->w();
+  pool_params.stride_height = pool->stride()->h();
+  pool_params.stride_width = pool->stride()->w();
+  pool_params.activation = pool->fusedActivationFunction();
+
+  // What will be stored in the scratchpad is not known at this point, so UINT8 is used as
+  // the most general element type.
+  auto scratch = std::make_unique<Tensor>(DataType::U8, Shape({}), AffineQuantization{}, "");
+  scratch->set_observable(false);
+  scratch->set_data_buffer(nullptr);
+
+  // When the node carries an execution plan (used in Static Memory Manager), the scratchpad's
+  // offset from the beginning of the shared memory buffer is the plan's second offset entry.
+  // TODO move tensors offset initialization to one place
+  if (luci::has_execution_plan(pool))
+  {
+    const auto plan = luci::get_execution_plan(pool);
+    const auto &offsets = plan.offsets();
+    // Only record an offset if one was planned for the AveragePool2D temporary.
+    if (offsets.size() > 1)
+    {
+      scratch->set_offset(offsets.at(1));
+    }
+  }
+  RuntimeGraph *runtime_graph = helper.getRuntimeGraph(pool->graph());
+  Tensor *scratch_tensor = runtime_graph->addTensor(std::move(scratch));
+
+  return std::make_unique<kernels::AveragePool2D>(input, output, scratch_tensor, pool_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/BatchMatMul.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/BatchMatMul.cpp
new file mode 100644
index 000000000..9da2f6d93
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/BatchMatMul.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/BatchMatMul.h"
+#include <luci/Plan/CircleNodeExecutionPlan.h>
+
+namespace luci_interpreter
+{
+
+// Builds the BatchMatMul kernel for a CircleBatchMatMul node. Two non-observable scratchpad
+// tensors (one per operand, using that operand's element type) are added to the node's
+// runtime graph and passed to the kernel together with the adj_x / adj_y attributes.
+std::unique_ptr<Kernel> build_kernel_CircleBatchMatMul(const luci::CircleNode *circle_node,
+                                                       KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleBatchMatMul *>(circle_node);
+  assert(node->arity() == 2);
+
+  const Tensor *lhs = helper.getInputTensor(node->x());
+  const Tensor *rhs = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  auto lhs_scratchpad =
+    std::make_unique<Tensor>(lhs->element_type(), Shape({}), AffineQuantization{}, "");
+  lhs_scratchpad->set_observable(false);
+  lhs_scratchpad->set_data_buffer(nullptr);
+  auto rhs_scratchpad =
+    std::make_unique<Tensor>(rhs->element_type(), Shape({}), AffineQuantization{}, "");
+  rhs_scratchpad->set_observable(false);
+  rhs_scratchpad->set_data_buffer(nullptr);
+  // If node has execution plan then read memory offsets for scratchpad temporary tensors
+  // from the beginning of shared memory buffer.
+  // Used in Static Memory Manager.
+  // TODO move tensors offset initialization to one place
+  if (luci::has_execution_plan(node))
+  {
+    const auto execution_plan = luci::get_execution_plan(node);
+    // Check whether the offsets for the current BatchMatMul temporaries were found.
+    if (execution_plan.offsets().size() > 1)
+    {
+      // Entries 1 and 2 are read below, so exactly three offsets are expected.
+      assert(execution_plan.offsets().size() == 3);
+
+      // If this is true, then we keep these offsets in the scratchpads.
+      lhs_scratchpad->set_offset(execution_plan.offsets().at(1));
+      rhs_scratchpad->set_offset(execution_plan.offsets().at(2));
+    }
+  }
+  // Both temporaries go into the same runtime graph; look it up once.
+  RuntimeGraph *runtime_graph = helper.getRuntimeGraph(node->graph());
+  Tensor *lhs_tmp = runtime_graph->addTensor(std::move(lhs_scratchpad));
+  Tensor *rhs_tmp = runtime_graph->addTensor(std::move(rhs_scratchpad));
+
+  // Value-initialize like the other builders so that no field is left indeterminate.
+  BatchMatMulParams params{};
+  params.adj_x = node->adj_x();
+  params.adj_y = node->adj_y();
+
+  return std::make_unique<kernels::BatchMatMul>(lhs, rhs, output, lhs_tmp, rhs_tmp, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp
new file mode 100644
index 000000000..ac6ebb30f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/BatchToSpaceND.h"
+
+namespace luci_interpreter
+{
+
+// Builds the BatchToSpaceND kernel for a CircleBatchToSpaceND node from its three inputs
+// (input, block_shape, crops) and its output tensor. The kernel takes no parameter struct.
+std::unique_ptr<Kernel> build_kernel_CircleBatchToSpaceND(const luci::CircleNode *circle_node,
+                                                          KernelBuilderHelper &helper)
+{
+  const auto *b2s_node = loco::must_cast<const luci::CircleBatchToSpaceND *>(circle_node);
+  assert(b2s_node->arity() == 3);
+
+  const Tensor *data = helper.getInputTensor(b2s_node->input());
+  const Tensor *block_shape_tensor = helper.getInputTensor(b2s_node->block_shape());
+  const Tensor *crops_tensor = helper.getInputTensor(b2s_node->crops());
+  Tensor *result = helper.getOutputTensor(b2s_node);
+
+  return std::make_unique<kernels::BatchToSpaceND>(data, block_shape_tensor, crops_tensor,
+                                                   result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Builders.h b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Builders.h
new file mode 100644
index 000000000..eab284008
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Builders.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_LOADER_NODES_BUILDERS_H
+#define LUCI_INTERPRETER_LOADER_NODES_BUILDERS_H
+
+#include "loader/KernelBuilderHelper.h"
+
+#include "luci/IR/CircleNodes.h"
+
+namespace luci_interpreter
+{
+
+// Declares one builder function, build_kernel_Circle<name>, for every REGISTER_KERNEL(name)
+// expansion. KernelsToBuild.lst is included while the macro is defined (presumably it consists
+// of REGISTER_KERNEL(...) entries; the list is not visible here), and the macro is removed
+// again right afterwards so it does not leak into including files.
+#define REGISTER_KERNEL(name)                                                            \
+  std::unique_ptr<Kernel> build_kernel_Circle##name(const luci::CircleNode *circle_node, \
+                                                    KernelBuilderHelper &helper);
+
+#include "KernelsToBuild.lst"
+
+#undef REGISTER_KERNEL
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_LOADER_NODES_BUILDERS_H
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Cast.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Cast.cpp
new file mode 100644
index 000000000..a16354c96
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Cast.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Cast.h"
+
+namespace luci_interpreter
+{
+
+// Builds the Cast kernel for a CircleCast node from its single input tensor and its output
+// tensor. The kernel takes no parameter struct.
+std::unique_ptr<Kernel> build_kernel_CircleCast(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *cast_node = loco::must_cast<const luci::CircleCast *>(circle_node);
+  assert(cast_node->arity() == 1);
+
+  const Tensor *source = helper.getInputTensor(cast_node->x());
+  Tensor *destination = helper.getOutputTensor(cast_node);
+
+  return std::make_unique<kernels::Cast>(source, destination);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Concatenation.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Concatenation.cpp
new file mode 100644
index 000000000..ba2564ea2
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Concatenation.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Concatenation.h"
+
+namespace luci_interpreter
+{
+
+// Builds the Concatenation kernel for a CircleConcatenation node. The input tensors are
+// collected in the order of the node's values(0..numValues()-1); axis and fused activation
+// are forwarded as parameters.
+std::unique_ptr<Kernel> build_kernel_CircleConcatenation(const luci::CircleNode *circle_node,
+                                                         KernelBuilderHelper &helper)
+{
+  const auto *concat = loco::must_cast<const luci::CircleConcatenation *>(circle_node);
+
+  const uint32_t input_count = concat->numValues();
+  std::vector<const Tensor *> input_tensors;
+  input_tensors.reserve(input_count);
+  for (uint32_t idx = 0; idx < input_count; ++idx)
+    input_tensors.push_back(helper.getInputTensor(concat->values(idx)));
+
+  Tensor *result = helper.getOutputTensor(concat);
+
+  ConcatenationParams concat_params{};
+  concat_params.axis = concat->axis();
+  concat_params.activation = concat->fusedActivationFunction();
+
+  return std::make_unique<kernels::Concatenation>(std::move(input_tensors), result,
+                                                  concat_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Conv2D.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Conv2D.cpp
new file mode 100644
index 000000000..218165e20
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Conv2D.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Conv2D.h"
+#include <luci/Plan/CircleNodeExecutionPlan.h>
+
+namespace luci_interpreter
+{
+
+// Builds the Conv2D kernel for a CircleConv2D node. Besides input, filter, bias and output,
+// a non-observable scratchpad tensor is added to the node's runtime graph and passed to the
+// kernel.
+std::unique_ptr<Kernel> build_kernel_CircleConv2D(const luci::CircleNode *circle_node,
+                                                  KernelBuilderHelper &helper)
+{
+  const auto *conv = loco::must_cast<const luci::CircleConv2D *>(circle_node);
+  assert(conv->arity() == 3);
+
+  const Tensor *input = helper.getInputTensor(conv->input());
+  const Tensor *filter = helper.getInputTensor(conv->filter());
+  // Bias is resolved through the optional-input lookup, unlike input and filter.
+  const Tensor *bias = helper.getOptionalInputTensor(conv->bias());
+  Tensor *output = helper.getOutputTensor(conv);
+
+  // What will be stored in the scratchpad is not known at this point, so UINT8 is used as
+  // the most general element type.
+  auto scratch = std::make_unique<Tensor>(DataType::U8, Shape({}), AffineQuantization{}, "");
+  scratch->set_observable(false);
+  scratch->set_data_buffer(nullptr);
+
+  // When the node carries an execution plan (used in Static Memory Manager), the scratchpad's
+  // offset from the beginning of the shared memory buffer is the plan's second offset entry.
+  // TODO move tensors offset initialization to one place
+  if (luci::has_execution_plan(conv))
+  {
+    const auto plan = luci::get_execution_plan(conv);
+    const auto &offsets = plan.offsets();
+    // Only record an offset if one was planned for the Conv2D temporary.
+    if (offsets.size() > 1)
+    {
+      scratch->set_offset(offsets.at(1));
+    }
+  }
+  RuntimeGraph *runtime_graph = helper.getRuntimeGraph(conv->graph());
+  Tensor *scratch_tensor = runtime_graph->addTensor(std::move(scratch));
+
+  Conv2DParams conv_params{};
+  conv_params.padding = conv->padding();
+  conv_params.stride_height = conv->stride()->h();
+  conv_params.stride_width = conv->stride()->w();
+  conv_params.dilation_height_factor = conv->dilation()->h();
+  conv_params.dilation_width_factor = conv->dilation()->w();
+  conv_params.activation = conv->fusedActivationFunction();
+
+  return std::make_unique<kernels::Conv2D>(input, filter, bias, output, scratch_tensor,
+                                           conv_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/DepthToSpace.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/DepthToSpace.cpp
new file mode 100644
index 000000000..174946367
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/DepthToSpace.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/DepthToSpace.h"
+
+namespace luci_interpreter
+{
+
+// Builds the DepthToSpace kernel for a CircleDepthToSpace node from its input and output
+// tensors, forwarding the node's block size.
+std::unique_ptr<Kernel> build_kernel_CircleDepthToSpace(const luci::CircleNode *circle_node,
+                                                        KernelBuilderHelper &helper)
+{
+  const auto *d2s_node = loco::must_cast<const luci::CircleDepthToSpace *>(circle_node);
+  assert(d2s_node->arity() == 1);
+
+  const Tensor *source = helper.getInputTensor(d2s_node->input());
+  Tensor *result = helper.getOutputTensor(d2s_node);
+
+  DepthToSpaceParams d2s_params{};
+  d2s_params.block_size = d2s_node->block_size();
+
+  return std::make_unique<kernels::DepthToSpace>(source, result, d2s_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp
new file mode 100644
index 000000000..8af1e3b58
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/DepthwiseConv2D.h"
+#include <luci/Plan/CircleNodeExecutionPlan.h>
+
+namespace luci_interpreter
+{
+
+// Builds the DepthwiseConv2D kernel for a CircleDepthwiseConv2D node. Besides input, filter,
+// bias and output, a non-observable scratchpad tensor is added to the node's runtime graph
+// and passed to the kernel.
+std::unique_ptr<Kernel> build_kernel_CircleDepthwiseConv2D(const luci::CircleNode *circle_node,
+                                                           KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleDepthwiseConv2D *>(circle_node);
+  assert(node->arity() == 3);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *filter = helper.getInputTensor(node->filter());
+  // Bias is resolved through the optional-input lookup, the same way the Conv2D builder does,
+  // so a node without a bias is not rejected at this point.
+  // NOTE(review): assumes kernels::DepthwiseConv2D accepts a null bias - confirm in the kernel.
+  const Tensor *bias = helper.getOptionalInputTensor(node->bias());
+  Tensor *output = helper.getOutputTensor(node);
+
+  DepthwiseConv2DParams params{};
+  params.padding = node->padding();
+  params.depth_multiplier = node->depthMultiplier();
+  params.stride_height = node->stride()->h();
+  params.stride_width = node->stride()->w();
+  params.dilation_height_factor = node->dilation()->h();
+  params.dilation_width_factor = node->dilation()->w();
+  params.activation = node->fusedActivationFunction();
+
+  // It is unknown what data will be stored in scratchpad tensor,
+  // using UINT8 as a most general option
+  auto scratchpad = std::make_unique<Tensor>(DataType::U8, Shape({}), AffineQuantization{}, "");
+  scratchpad->set_observable(false);
+  scratchpad->set_data_buffer(nullptr);
+  // If node has execution plan then read memory offsets for scratchpad temporary tensor
+  // from the beginning of shared memory buffer.
+  // Used in Static Memory Manager.
+  // TODO move tensors offset initialization to one place
+  if (luci::has_execution_plan(node))
+  {
+    const auto execution_plan = luci::get_execution_plan(node);
+    // Check whether the offset for the current CircleDepthwiseConv2D temporary was found.
+    if (execution_plan.offsets().size() > 1)
+    {
+      // If this is true, then we keep this offset in scratchpad.
+      scratchpad->set_offset(execution_plan.offsets().at(1));
+    }
+  }
+  Tensor *tmp = helper.getRuntimeGraph(node->graph())->addTensor(std::move(scratchpad));
+
+  return std::make_unique<kernels::DepthwiseConv2D>(input, filter, bias, output, tmp, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Dequantize.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Dequantize.cpp
new file mode 100644
index 000000000..787322e9b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Dequantize.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Dequantize.h"
+
+namespace luci_interpreter
+{
+
+// Builds the Dequantize kernel for a CircleDequantize node from its input tensor and its
+// output tensor. The kernel takes no parameter struct.
+std::unique_ptr<Kernel> build_kernel_CircleDequantize(const luci::CircleNode *circle_node,
+                                                      KernelBuilderHelper &helper)
+{
+  const auto *dequantize_node = loco::must_cast<const luci::CircleDequantize *>(circle_node);
+
+  const Tensor *source = helper.getInputTensor(dequantize_node->input());
+  Tensor *result = helper.getOutputTensor(dequantize_node);
+
+  return std::make_unique<kernels::Dequantize>(source, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Div.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Div.cpp
new file mode 100644
index 000000000..0611dfdab
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Div.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Div.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleDiv(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  // Build a Div kernel from operands `x` and `y`, the output and the fused activation.
+  const auto *div_node = loco::must_cast<const luci::CircleDiv *>(circle_node);
+  assert(div_node->arity() == 2);
+  const Tensor *lhs = helper.getInputTensor(div_node->x());
+  const Tensor *rhs = helper.getInputTensor(div_node->y());
+  Tensor *result = helper.getOutputTensor(div_node);
+
+  DivParams div_params{};
+  div_params.activation = div_node->fusedActivationFunction();
+  return std::make_unique<kernels::Div>(lhs, rhs, result, div_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Elu.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Elu.cpp
new file mode 100644
index 000000000..a79985e3b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Elu.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Elu.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleElu(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  // Build an Elu kernel from the node's `features` operand and its output tensor.
+  const auto *elu_node = loco::must_cast<const luci::CircleElu *>(circle_node);
+  assert(elu_node->arity() == 1);
+  const Tensor *features = helper.getInputTensor(elu_node->features());
+  Tensor *result = helper.getOutputTensor(elu_node);
+
+  return std::make_unique<kernels::Elu>(features, result);
+}
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Equal.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Equal.cpp
new file mode 100644
index 000000000..59692883f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Equal.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Equal.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleEqual(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  // Build an Equal kernel from the node's `x` and `y` operands and its output tensor.
+  const auto *equal_node = loco::must_cast<const luci::CircleEqual *>(circle_node);
+  assert(equal_node->arity() == 2);
+
+  const Tensor *lhs = helper.getInputTensor(equal_node->x());
+  const Tensor *rhs = helper.getInputTensor(equal_node->y());
+  Tensor *result = helper.getOutputTensor(equal_node);
+
+  return std::make_unique<kernels::Equal>(lhs, rhs, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Exp.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Exp.cpp
new file mode 100644
index 000000000..30d11cb89
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Exp.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Exp.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleExp(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  // Build an Exp kernel from the node's single `x` operand and its output tensor.
+  const auto *exp_node = loco::must_cast<const luci::CircleExp *>(circle_node);
+  assert(exp_node->arity() == 1);
+  const Tensor *operand = helper.getInputTensor(exp_node->x());
+  Tensor *result = helper.getOutputTensor(exp_node);
+
+  return std::make_unique<kernels::Exp>(operand, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/ExpandDims.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/ExpandDims.cpp
new file mode 100644
index 000000000..9840c34e5
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/ExpandDims.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/ExpandDims.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleExpandDims(const luci::CircleNode *circle_node,
+                                                      KernelBuilderHelper &helper)
+{
+  // Build an ExpandDims kernel from the node's `input` and `axis` operands and its output.
+  const auto *expand_node = loco::must_cast<const luci::CircleExpandDims *>(circle_node);
+  assert(expand_node->arity() == 2);
+  const Tensor *data = helper.getInputTensor(expand_node->input());
+  const Tensor *axis_tensor = helper.getInputTensor(expand_node->axis());
+  Tensor *result = helper.getOutputTensor(expand_node);
+
+  return std::make_unique<kernels::ExpandDims>(data, axis_tensor, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Fill.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Fill.cpp
new file mode 100644
index 000000000..3aefdf1c5
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Fill.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Fill.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleFill(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  // Build a Fill kernel from the node's `dims` and `value` operands and its output tensor.
+  const auto *fill_node = loco::must_cast<const luci::CircleFill *>(circle_node);
+  assert(fill_node->arity() == 2);
+  const Tensor *dims_tensor = helper.getInputTensor(fill_node->dims());
+  const Tensor *value_tensor = helper.getInputTensor(fill_node->value());
+  Tensor *result = helper.getOutputTensor(fill_node);
+
+  return std::make_unique<kernels::Fill>(dims_tensor, value_tensor, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Floor.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Floor.cpp
new file mode 100644
index 000000000..e0a223116
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Floor.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Floor.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleFloor(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  // Build a Floor kernel from the node's single `x` operand and its output tensor.
+  const auto *floor_node = loco::must_cast<const luci::CircleFloor *>(circle_node);
+  assert(floor_node->arity() == 1);
+  const Tensor *operand = helper.getInputTensor(floor_node->x());
+  Tensor *result = helper.getOutputTensor(floor_node);
+
+  return std::make_unique<kernels::Floor>(operand, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/FloorDiv.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/FloorDiv.cpp
new file mode 100644
index 000000000..a45d89e38
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/FloorDiv.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/FloorDiv.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleFloorDiv(const luci::CircleNode *circle_node,
+                                                    KernelBuilderHelper &helper)
+{
+  // Build a FloorDiv kernel from the node's `x` and `y` operands and its output tensor.
+  const auto *floordiv_node = loco::must_cast<const luci::CircleFloorDiv *>(circle_node);
+  assert(floordiv_node->arity() == 2);
+  const Tensor *lhs = helper.getInputTensor(floordiv_node->x());
+  const Tensor *rhs = helper.getInputTensor(floordiv_node->y());
+  Tensor *result = helper.getOutputTensor(floordiv_node);
+
+  return std::make_unique<kernels::FloorDiv>(lhs, rhs, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/FullyConnected.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/FullyConnected.cpp
new file mode 100644
index 000000000..b7b742b8a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/FullyConnected.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/FullyConnected.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleFullyConnected(const luci::CircleNode *circle_node,
+                                                          KernelBuilderHelper &helper)
+{
+  // Build a FullyConnected kernel; `bias` is looked up through the optional-input accessor.
+  const auto *fc_node = loco::must_cast<const luci::CircleFullyConnected *>(circle_node);
+  assert(fc_node->arity() == 3);
+  const Tensor *activations = helper.getInputTensor(fc_node->input());
+  const Tensor *weight_tensor = helper.getInputTensor(fc_node->weights());
+  const Tensor *bias_tensor = helper.getOptionalInputTensor(fc_node->bias());
+  Tensor *result = helper.getOutputTensor(fc_node);
+
+  FullyConnectedParams fc_params{};
+  fc_params.activation = fc_node->fusedActivationFunction();
+  fc_params.keep_num_dims = fc_node->keep_num_dims();
+
+  return std::make_unique<kernels::FullyConnected>(activations, weight_tensor, bias_tensor, result,
+                                                   fc_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Gather.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Gather.cpp
new file mode 100644
index 000000000..2ee2906e0
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Gather.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Gather.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleGather(const luci::CircleNode *circle_node,
+                                                  KernelBuilderHelper &helper)
+{
+  // Build a Gather kernel from the node's `params` and `indices` operands and its `axis`.
+  const auto *gather_node = loco::must_cast<const luci::CircleGather *>(circle_node);
+  assert(gather_node->arity() == 2);
+  const Tensor *params_tensor = helper.getInputTensor(gather_node->params());
+  const Tensor *indices_tensor = helper.getInputTensor(gather_node->indices());
+  Tensor *result = helper.getOutputTensor(gather_node);
+
+  GatherParams gather_params{};
+  gather_params.axis = gather_node->axis();
+  // TODO support batch_dims
+  gather_params.batch_dims = 0;
+
+  return std::make_unique<kernels::Gather>(params_tensor, indices_tensor, result, gather_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Greater.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Greater.cpp
new file mode 100644
index 000000000..80aa63cf0
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Greater.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Greater.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleGreater(const luci::CircleNode *circle_node,
+                                                   KernelBuilderHelper &helper)
+{
+  // Build a Greater kernel from the node's `x` and `y` operands and its output tensor.
+  const auto *greater_node = loco::must_cast<const luci::CircleGreater *>(circle_node);
+  assert(greater_node->arity() == 2);
+  const Tensor *lhs = helper.getInputTensor(greater_node->x());
+  const Tensor *rhs = helper.getInputTensor(greater_node->y());
+  Tensor *result = helper.getOutputTensor(greater_node);
+
+  return std::make_unique<kernels::Greater>(lhs, rhs, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/GreaterEqual.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/GreaterEqual.cpp
new file mode 100644
index 000000000..272f2843b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/GreaterEqual.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/GreaterEqual.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleGreaterEqual(const luci::CircleNode *circle_node,
+                                                        KernelBuilderHelper &helper)
+{
+  // Build a GreaterEqual kernel from the node's `x` and `y` operands and its output tensor.
+  const auto *ge_node = loco::must_cast<const luci::CircleGreaterEqual *>(circle_node);
+  assert(ge_node->arity() == 2);
+  const Tensor *lhs = helper.getInputTensor(ge_node->x());
+  const Tensor *rhs = helper.getInputTensor(ge_node->y());
+  Tensor *result = helper.getOutputTensor(ge_node);
+
+  return std::make_unique<kernels::GreaterEqual>(lhs, rhs, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/If.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/If.cpp
new file mode 100644
index 000000000..3ac7d4941
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/If.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/If.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleIf(const luci::CircleNode *circle_node,
+                                              KernelBuilderHelper &helper)
+{
+  // Build an If kernel: condition, node inputs, outputs and the then/else runtime graphs.
+  const auto *if_node = loco::must_cast<const luci::CircleIf *>(circle_node);
+  auto if_outs = collectOutputNodes<luci::CircleIfOut>(if_node);
+  assert(if_node->arity() == 1 + if_node->input_count());
+  assert(if_outs.size() == static_cast<size_t>(if_node->output_count()));
+
+  const Tensor *cond = helper.getInputTensor(if_node->cond());
+  std::vector<const Tensor *> inputs;
+  inputs.reserve(if_node->input_count());
+  for (uint32_t idx = 0; idx < if_node->input_count(); ++idx)
+    inputs.push_back(helper.getInputTensor(if_node->input(idx)));
+  std::vector<Tensor *> outputs = helper.getOutputTensors(if_outs);
+
+  RuntimeGraph *then_branch = helper.getRuntimeGraph(if_node->then_graph());
+  RuntimeGraph *else_branch = helper.getRuntimeGraph(if_node->else_graph());
+
+  return std::make_unique<kernels::If>(cond, std::move(inputs), std::move(outputs), then_branch,
+                                       else_branch);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/InstanceNorm.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/InstanceNorm.cpp
new file mode 100644
index 000000000..06031e5bc
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/InstanceNorm.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/InstanceNorm.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleInstanceNorm(const luci::CircleNode *circle_node,
+                                                        KernelBuilderHelper &helper)
+{
+  // Build an InstanceNorm kernel from `input`, `gamma`, `beta`, epsilon and fused activation.
+  const auto *inorm_node = loco::must_cast<const luci::CircleInstanceNorm *>(circle_node);
+  assert(inorm_node->arity() == 3);
+
+  const Tensor *data = helper.getInputTensor(inorm_node->input());
+  const Tensor *gamma_tensor = helper.getInputTensor(inorm_node->gamma());
+  const Tensor *beta_tensor = helper.getInputTensor(inorm_node->beta());
+  Tensor *result = helper.getOutputTensor(inorm_node);
+
+  InstanceNormParams inorm_params{};
+  inorm_params.epsilon = inorm_node->epsilon();
+  inorm_params.activation = inorm_node->fusedActivationFunction();
+
+  return std::make_unique<kernels::InstanceNorm>(data, gamma_tensor, beta_tensor, result,
+                                                 inorm_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/L2Normalize.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/L2Normalize.cpp
new file mode 100644
index 000000000..6e22e6d4e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/L2Normalize.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/L2Normalize.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleL2Normalize(const luci::CircleNode *circle_node,
+                                                       KernelBuilderHelper &helper)
+{
+  // Build an L2Normalize kernel from the node's `x` operand and its fused activation.
+  const auto *l2norm_node = loco::must_cast<const luci::CircleL2Normalize *>(circle_node);
+  assert(l2norm_node->arity() == 1);
+  const Tensor *operand = helper.getInputTensor(l2norm_node->x());
+  Tensor *result = helper.getOutputTensor(l2norm_node);
+
+  L2NormParams l2norm_params{};
+  l2norm_params.activation = l2norm_node->fusedActivationFunction();
+
+  return std::make_unique<kernels::L2Normalize>(operand, result, l2norm_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/L2Pool2D.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/L2Pool2D.cpp
new file mode 100644
index 000000000..95b55896f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/L2Pool2D.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/L2Pool2D.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleL2Pool2D(const luci::CircleNode *circle_node,
+                                                    KernelBuilderHelper &helper)
+{
+  // Build an L2Pool2D kernel; padding, filter, stride and activation are copied from the node.
+  const auto *pool_node = loco::must_cast<const luci::CircleL2Pool2D *>(circle_node);
+  assert(pool_node->arity() == 1);
+  const Tensor *operand = helper.getInputTensor(pool_node->value());
+  Tensor *result = helper.getOutputTensor(pool_node);
+
+  Pool2DParams pool_params{};
+  pool_params.padding = pool_node->padding();
+  pool_params.filter_height = pool_node->filter()->h();
+  pool_params.filter_width = pool_node->filter()->w();
+  pool_params.stride_height = pool_node->stride()->h();
+  pool_params.stride_width = pool_node->stride()->w();
+  pool_params.activation = pool_node->fusedActivationFunction();
+
+  return std::make_unique<kernels::L2Pool2D>(operand, result, pool_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/LeakyRelu.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/LeakyRelu.cpp
new file mode 100644
index 000000000..bbf5067b1
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/LeakyRelu.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/LeakyRelu.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLeakyRelu(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{
+  // Build a LeakyRelu kernel from the node's `features` operand and its `alpha` attribute.
+  const auto *lrelu_node = loco::must_cast<const luci::CircleLeakyRelu *>(circle_node);
+  assert(lrelu_node->arity() == 1);
+  const Tensor *features = helper.getInputTensor(lrelu_node->features());
+  Tensor *result = helper.getOutputTensor(lrelu_node);
+
+  LeakyReluParams lrelu_params{};
+  lrelu_params.alpha = lrelu_node->alpha();
+  return std::make_unique<kernels::LeakyRelu>(features, result, lrelu_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Less.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Less.cpp
new file mode 100644
index 000000000..ae914ecc9
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Less.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Less.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLess(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  // Build a Less kernel from the node's `x` and `y` operands and its output tensor.
+  const auto *less_node = loco::must_cast<const luci::CircleLess *>(circle_node);
+  assert(less_node->arity() == 2);
+  const Tensor *lhs = helper.getInputTensor(less_node->x());
+  const Tensor *rhs = helper.getInputTensor(less_node->y());
+  Tensor *result = helper.getOutputTensor(less_node);
+
+  return std::make_unique<kernels::Less>(lhs, rhs, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/LessEqual.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/LessEqual.cpp
new file mode 100644
index 000000000..f1b424b55
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/LessEqual.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/LessEqual.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLessEqual(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{
+  // Build a LessEqual kernel from the node's `x` and `y` operands and its output tensor.
+  const auto *le_node = loco::must_cast<const luci::CircleLessEqual *>(circle_node);
+  assert(le_node->arity() == 2);
+  const Tensor *lhs = helper.getInputTensor(le_node->x());
+  const Tensor *rhs = helper.getInputTensor(le_node->y());
+  Tensor *result = helper.getOutputTensor(le_node);
+
+  return std::make_unique<kernels::LessEqual>(lhs, rhs, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp
new file mode 100644
index 000000000..962ca2d7c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/LocalResponseNormalization.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel>
+build_kernel_CircleLocalResponseNormalization(const luci::CircleNode *circle_node,
+                                              KernelBuilderHelper &helper)
+{
+  const auto *lrn = loco::must_cast<const luci::CircleLocalResponseNormalization *>(circle_node);
+  assert(lrn->arity() == 1);
+  const Tensor *src = helper.getInputTensor(lrn->input());
+  Tensor *dst = helper.getOutputTensor(lrn);
+  // Mirror the node's radius, bias, alpha and beta into the kernel parameter struct.
+  LocalResponseNormalizationParams lrn_params{};
+  lrn_params.radius = lrn->radius();
+  lrn_params.bias = lrn->bias();
+  lrn_params.alpha = lrn->alpha();
+  lrn_params.beta = lrn->beta();
+  // The parameters are passed as the third constructor argument, after the two tensors.
+  return std::make_unique<kernels::LocalResponseNormalization>(src, dst, lrn_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/LogSoftmax.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/LogSoftmax.cpp
new file mode 100644
index 000000000..432204115
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/LogSoftmax.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/LogSoftmax.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLogSoftmax(const luci::CircleNode *circle_node,
+                                                      KernelBuilderHelper &helper)
+{
+  const auto *log_softmax = loco::must_cast<const luci::CircleLogSoftmax *>(circle_node);
+  assert(log_softmax->arity() == 1);
+  // The single operand is exposed by the node as logits().
+  const Tensor *logits = helper.getInputTensor(log_softmax->logits());
+  Tensor *result = helper.getOutputTensor(log_softmax);
+  // Only the two tensors are forwarded to the kernel constructor.
+  return std::make_unique<kernels::LogSoftmax>(logits, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/LogicalAnd.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/LogicalAnd.cpp
new file mode 100644
index 000000000..bf3cb671a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/LogicalAnd.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/LogicalAnd.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLogicalAnd(const luci::CircleNode *circle_node,
+                                                      KernelBuilderHelper &helper)
+{
+  const auto *logical_and = loco::must_cast<const luci::CircleLogicalAnd *>(circle_node);
+  assert(logical_and->arity() == 2);
+  // Resolve x() and y() to input tensors, and the node itself to its output tensor.
+  const Tensor *lhs = helper.getInputTensor(logical_and->x());
+  const Tensor *rhs = helper.getInputTensor(logical_and->y());
+  Tensor *result = helper.getOutputTensor(logical_and);
+  // The kernel constructor takes the operands in x, y order followed by the output.
+  return std::make_unique<kernels::LogicalAnd>(lhs, rhs, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/LogicalNot.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/LogicalNot.cpp
new file mode 100644
index 000000000..fefcd9a06
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/LogicalNot.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/LogicalNot.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLogicalNot(const luci::CircleNode *circle_node,
+                                                      KernelBuilderHelper &helper)
+{
+  const auto *logical_not = loco::must_cast<const luci::CircleLogicalNot *>(circle_node);
+  assert(logical_not->arity() == 1);
+  // The single operand is exposed by the node as x().
+  const Tensor *operand = helper.getInputTensor(logical_not->x());
+  Tensor *result = helper.getOutputTensor(logical_not);
+  // Only the two tensors are forwarded to the kernel constructor.
+  return std::make_unique<kernels::LogicalNot>(operand, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/LogicalOr.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/LogicalOr.cpp
new file mode 100644
index 000000000..a416cb401
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/LogicalOr.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/LogicalOr.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLogicalOr(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{
+  const auto *logical_or = loco::must_cast<const luci::CircleLogicalOr *>(circle_node);
+  assert(logical_or->arity() == 2);
+  // Resolve x() and y() to input tensors, and the node itself to its output tensor.
+  const Tensor *lhs = helper.getInputTensor(logical_or->x());
+  const Tensor *rhs = helper.getInputTensor(logical_or->y());
+  Tensor *result = helper.getOutputTensor(logical_or);
+  // The kernel constructor takes the operands in x, y order followed by the output.
+  return std::make_unique<kernels::LogicalOr>(lhs, rhs, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Logistic.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Logistic.cpp
new file mode 100644
index 000000000..4a69deef1
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Logistic.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Logistic.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLogistic(const luci::CircleNode *circle_node,
+                                                    KernelBuilderHelper &helper)
+{
+  const auto *logistic = loco::must_cast<const luci::CircleLogistic *>(circle_node);
+  assert(logistic->arity() == 1);
+  // The single operand is exposed by the node as x().
+  const Tensor *operand = helper.getInputTensor(logistic->x());
+  Tensor *result = helper.getOutputTensor(logistic);
+  // Only the two tensors are forwarded to the kernel constructor.
+  return std::make_unique<kernels::Logistic>(operand, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/MaxPool2D.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/MaxPool2D.cpp
new file mode 100644
index 000000000..f66a206ca
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/MaxPool2D.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/MaxPool2D.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleMaxPool2D(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{
+  const auto *pool = loco::must_cast<const luci::CircleMaxPool2D *>(circle_node);
+  assert(pool->arity() == 1);
+  // The pooled operand is exposed by the node as value().
+  const Tensor *src = helper.getInputTensor(pool->value());
+  Tensor *dst = helper.getOutputTensor(pool);
+  // Copy padding, filter size, strides and fused activation from the node.
+  Pool2DParams pool_params{};
+  pool_params.padding = pool->padding();
+  pool_params.filter_height = pool->filter()->h();
+  pool_params.filter_width = pool->filter()->w();
+  pool_params.stride_height = pool->stride()->h();
+  pool_params.stride_width = pool->stride()->w();
+  pool_params.activation = pool->fusedActivationFunction();
+  // The filled-in parameters are passed to the kernel after the two tensors.
+  return std::make_unique<kernels::MaxPool2D>(src, dst, pool_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Maximum.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Maximum.cpp
new file mode 100644
index 000000000..d0bff776a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Maximum.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Maximum.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleMaximum(const luci::CircleNode *circle_node,
+                                                   KernelBuilderHelper &helper)
+{
+  const auto *maximum = loco::must_cast<const luci::CircleMaximum *>(circle_node);
+  assert(maximum->arity() == 2);
+  // Resolve x() and y() to input tensors, and the node itself to its output tensor.
+  const Tensor *lhs = helper.getInputTensor(maximum->x());
+  const Tensor *rhs = helper.getInputTensor(maximum->y());
+  Tensor *result = helper.getOutputTensor(maximum);
+  // The kernel constructor takes the operands in x, y order followed by the output.
+  return std::make_unique<kernels::Maximum>(lhs, rhs, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Mean.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Mean.cpp
new file mode 100644
index 000000000..0dec63e79
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Mean.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Mean.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleMean(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *mean = loco::must_cast<const luci::CircleMean *>(circle_node);
+  assert(mean->arity() == 2);
+
+  const Tensor *data = helper.getInputTensor(mean->input());
+  const Tensor *reduce_axes = helper.getInputTensor(mean->reduction_indices());
+  Tensor *result = helper.getOutputTensor(mean);
+
+  // Creates one auxiliary tensor of the requested element type and passes it by move to
+  // addTensor() on the runtime graph the helper returns for this node's graph. Each such
+  // tensor gets an empty shape, default-constructed quantization and an empty name, is
+  // marked non-observable and has its data buffer set to nullptr.
+  auto add_scratch = [&helper, mean](auto dtype) -> Tensor * {
+    auto scratch = std::make_unique<Tensor>(dtype, Shape({}), AffineQuantization{}, "");
+    scratch->set_observable(false);
+    scratch->set_data_buffer(nullptr);
+    return helper.getRuntimeGraph(mean->graph())->addTensor(std::move(scratch));
+  };
+
+  // Three auxiliary tensors are created, in the same order in which the kernel constructor
+  // receives them: two S32 tensors first, then one that shares the input's element type.
+  Tensor *index_scratch = add_scratch(DataType::S32);
+  Tensor *axes_scratch = add_scratch(DataType::S32);
+  Tensor *sum_scratch = add_scratch(data->element_type());
+
+  // keep_dims is the only attribute copied from the node.
+  ReducerParams reducer_params{};
+  reducer_params.keep_dims = mean->keep_dims();
+
+  // Argument order: input, axes, output, the three auxiliary tensors, then the parameters.
+  // NOTE(review): the auxiliary tensors presumably receive storage elsewhere - confirm.
+  return std::make_unique<kernels::Mean>(data, reduce_axes, result, index_scratch, axes_scratch,
+                                         sum_scratch, reducer_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Minimum.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Minimum.cpp
new file mode 100644
index 000000000..1a49c1090
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Minimum.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Minimum.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleMinimum(const luci::CircleNode *circle_node,
+                                                   KernelBuilderHelper &helper)
+{
+  const auto *minimum = loco::must_cast<const luci::CircleMinimum *>(circle_node);
+  assert(minimum->arity() == 2);
+  // Resolve x() and y() to input tensors, and the node itself to its output tensor.
+  const Tensor *lhs = helper.getInputTensor(minimum->x());
+  const Tensor *rhs = helper.getInputTensor(minimum->y());
+  Tensor *result = helper.getOutputTensor(minimum);
+  // The kernel constructor takes the operands in x, y order followed by the output.
+  return std::make_unique<kernels::Minimum>(lhs, rhs, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/MirrorPad.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/MirrorPad.cpp
new file mode 100644
index 000000000..b221b4574
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/MirrorPad.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/MirrorPad.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleMirrorPad(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{
+  const auto *mirror_pad = loco::must_cast<const luci::CircleMirrorPad *>(circle_node);
+  assert(mirror_pad->arity() == 2);
+  // Operands come from the node's input() and paddings() accessors.
+  const Tensor *data = helper.getInputTensor(mirror_pad->input());
+  const Tensor *pad_sizes = helper.getInputTensor(mirror_pad->paddings());
+  Tensor *result = helper.getOutputTensor(mirror_pad);
+  // mode is the only attribute copied into the parameter struct.
+  MirrorPadParams pad_params{};
+  pad_params.mode = mirror_pad->mode();
+
+  return std::make_unique<kernels::MirrorPad>(data, pad_sizes, result, pad_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Mul.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Mul.cpp
new file mode 100644
index 000000000..f9984853a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Mul.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Mul.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleMul(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *mul = loco::must_cast<const luci::CircleMul *>(circle_node);
+  assert(mul->arity() == 2);
+  // Resolve x() and y() to input tensors, and the node itself to its output tensor.
+  const Tensor *lhs = helper.getInputTensor(mul->x());
+  const Tensor *rhs = helper.getInputTensor(mul->y());
+  Tensor *result = helper.getOutputTensor(mul);
+  // The fused activation is the only attribute copied into the parameter struct.
+  MulParams mul_params{};
+  mul_params.activation = mul->fusedActivationFunction();
+
+  return std::make_unique<kernels::Mul>(lhs, rhs, result, mul_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Neg.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Neg.cpp
new file mode 100644
index 000000000..9a9ecf991
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Neg.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Neg.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleNeg(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *neg = loco::must_cast<const luci::CircleNeg *>(circle_node);
+  assert(neg->arity() == 1);
+  // The single operand is exposed by the node as x().
+  const Tensor *operand = helper.getInputTensor(neg->x());
+  Tensor *result = helper.getOutputTensor(neg);
+  // Only the two tensors are forwarded to the kernel constructor.
+  return std::make_unique<kernels::Neg>(operand, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/NotEqual.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/NotEqual.cpp
new file mode 100644
index 000000000..3916a5854
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/NotEqual.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/NotEqual.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleNotEqual(const luci::CircleNode *circle_node,
+                                                    KernelBuilderHelper &helper)
+{
+  const auto *not_equal = loco::must_cast<const luci::CircleNotEqual *>(circle_node);
+  assert(not_equal->arity() == 2);
+  // Ask the helper for the tensors behind both operands and behind the node itself.
+  const Tensor *lhs = helper.getInputTensor(not_equal->x());
+  const Tensor *rhs = helper.getInputTensor(not_equal->y());
+  Tensor *result = helper.getOutputTensor(not_equal);
+  // No parameter struct is passed; the kernel is built from the three tensors alone.
+  return std::make_unique<kernels::NotEqual>(lhs, rhs, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/OneHot.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/OneHot.cpp
new file mode 100644
index 000000000..a40160945
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/OneHot.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/OneHot.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleOneHot(const luci::CircleNode *circle_node,
+                                                  KernelBuilderHelper &helper)
+{
+  const auto *one_hot = loco::must_cast<const luci::CircleOneHot *>(circle_node);
+  assert(one_hot->arity() == 4);
+  // Four operands are resolved, in the order the kernel constructor expects them.
+  const Tensor *idx = helper.getInputTensor(one_hot->indices());
+  const Tensor *depth_in = helper.getInputTensor(one_hot->depth());
+  const Tensor *on_in = helper.getInputTensor(one_hot->on_value());
+  const Tensor *off_in = helper.getInputTensor(one_hot->off_value());
+  Tensor *result = helper.getOutputTensor(one_hot);
+  // axis is the only attribute copied into the parameter struct.
+  OneHotParams hot_params{};
+  hot_params.axis = one_hot->axis();
+
+  return std::make_unique<kernels::OneHot>(idx, depth_in, on_in, off_in, result, hot_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/PRelu.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/PRelu.cpp
new file mode 100644
index 000000000..f3d700c95
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/PRelu.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/PRelu.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CirclePRelu(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *prelu = loco::must_cast<const luci::CirclePRelu *>(circle_node);
+  assert(prelu->arity() == 2);
+  // Operands come from the node's input() and alpha() accessors.
+  const Tensor *data = helper.getInputTensor(prelu->input());
+  const Tensor *alpha_in = helper.getInputTensor(prelu->alpha());
+  Tensor *result = helper.getOutputTensor(prelu);
+  // No parameter struct is passed; the kernel is built from the three tensors alone.
+  return std::make_unique<kernels::PRelu>(data, alpha_in, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Pack.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Pack.cpp
new file mode 100644
index 000000000..efc5850e0
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Pack.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Pack.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CirclePack(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *pack = loco::must_cast<const luci::CirclePack *>(circle_node);
+  assert(pack->arity() == pack->values_count());
+  // Gather one input tensor per values(i), appended in increasing index order.
+  std::vector<const Tensor *> operands;
+  operands.reserve(pack->values_count());
+  for (uint32_t idx = 0; idx < pack->values_count(); ++idx)
+  {
+    operands.push_back(helper.getInputTensor(pack->values(idx)));
+  }
+  Tensor *result = helper.getOutputTensor(pack);
+  // The parameter struct carries the node's axis and its number of values.
+  PackParams pack_params{};
+  pack_params.axis = pack->axis();
+  pack_params.values_count = pack->values_count();
+  return std::make_unique<kernels::Pack>(std::move(operands), result, pack_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Pad.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Pad.cpp
new file mode 100644
index 000000000..67ce997a7
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Pad.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Pad.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CirclePad(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *pad = loco::must_cast<const luci::CirclePad *>(circle_node);
+  assert(pad->arity() == 2);
+  // Operands come from the node's input() and paddings() accessors.
+  const Tensor *data = helper.getInputTensor(pad->input());
+  const Tensor *pad_sizes = helper.getInputTensor(pad->paddings());
+  Tensor *result = helper.getOutputTensor(pad);
+  // No parameter struct is passed; the kernel is built from the three tensors alone.
+  return std::make_unique<kernels::Pad>(data, pad_sizes, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/PadV2.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/PadV2.cpp
new file mode 100644
index 000000000..e378a972a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/PadV2.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/PadV2.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CirclePadV2(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *pad_v2 = loco::must_cast<const luci::CirclePadV2 *>(circle_node);
+  assert(pad_v2->arity() == 3);
+  // Operands come from the node's input(), paddings() and constant_values() accessors.
+  const Tensor *data = helper.getInputTensor(pad_v2->input());
+  const Tensor *pad_sizes = helper.getInputTensor(pad_v2->paddings());
+  const Tensor *fill = helper.getInputTensor(pad_v2->constant_values());
+  Tensor *result = helper.getOutputTensor(pad_v2);
+  // No parameter struct is passed; the kernel is built from the four tensors alone.
+  return std::make_unique<kernels::PadV2>(data, pad_sizes, fill, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Pow.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Pow.cpp
new file mode 100644
index 000000000..d32fc3dbb
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Pow.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Pow.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CirclePow(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CirclePow *>(circle_node);
+  assert(node->arity() == 2);
+
+  // Operands are taken from the node's x() and y() inputs, in that order.
+  const Tensor *const x_tensor = helper.getInputTensor(node->x());
+  const Tensor *const y_tensor = helper.getInputTensor(node->y());
+  Tensor *const result = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Pow>(x_tensor, y_tensor, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Quantize.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Quantize.cpp
new file mode 100644
index 000000000..cb36fb6da
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Quantize.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Quantize.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleQuantize(const luci::CircleNode *circle_node,
+                                                    KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleQuantize *>(circle_node);
+  assert(node->arity() == 1);
+
+  // Look up the tensors for the node's single operand and for its result.
+  const Tensor *const src = helper.getInputTensor(node->input());
+  Tensor *const dst = helper.getOutputTensor(node);
+  return std::make_unique<kernels::Quantize>(src, dst);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Relu.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Relu.cpp
new file mode 100644
index 000000000..1d64c1c4e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Relu.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Relu.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleRelu(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleRelu *>(circle_node);
+  assert(node->arity() == 1);
+
+  // The single operand is the node's features() input.
+  const Tensor *const src = helper.getInputTensor(node->features());
+  Tensor *const dst = helper.getOutputTensor(node);
+  return std::make_unique<kernels::Relu>(src, dst);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Relu6.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Relu6.cpp
new file mode 100644
index 000000000..e50cd2545
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Relu6.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Relu6.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleRelu6(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleRelu6 *>(circle_node);
+  assert(node->arity() == 1);
+
+  // The single operand is the node's features() input.
+  const Tensor *const src = helper.getInputTensor(node->features());
+  Tensor *const dst = helper.getOutputTensor(node);
+  return std::make_unique<kernels::Relu6>(src, dst);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Reshape.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Reshape.cpp
new file mode 100644
index 000000000..76ddd88a3
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Reshape.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Reshape.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleReshape(const luci::CircleNode *circle_node,
+                                                   KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleReshape *>(circle_node);
+  assert(node->arity() == 2);
+
+  // NOTE Only the 'shape' input is forwarded; the 'newShape' attribute is ignored.
+  const Tensor *const src = helper.getInputTensor(node->tensor());
+  const Tensor *const target_shape = helper.getInputTensor(node->shape());
+  Tensor *const dst = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Reshape>(src, target_shape, dst);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp
new file mode 100644
index 000000000..dc2b88ad3
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/ResizeBilinear.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleResizeBilinear(const luci::CircleNode *circle_node,
+                                                          KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleResizeBilinear *>(circle_node);
+  assert(node->arity() == 2);
+
+  // Copy the resize attributes from the node into the kernel parameters.
+  ResizeBilinearParams resize_params{};
+  resize_params.align_corners = node->align_corners();
+  resize_params.half_pixel_centers = node->half_pixel_centers();
+
+  const Tensor *const src = helper.getInputTensor(node->input());
+  const Tensor *const new_size = helper.getInputTensor(node->size());
+  Tensor *const dst = helper.getOutputTensor(node);
+  return std::make_unique<kernels::ResizeBilinear>(src, new_size, dst, resize_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp
new file mode 100644
index 000000000..c7058ae78
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/ResizeNearestNeighbor.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel>
+build_kernel_CircleResizeNearestNeighbor(const luci::CircleNode *circle_node,
+                                         KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleResizeNearestNeighbor *>(circle_node);
+  assert(node->arity() == 2);
+
+  ResizeNearestNeighborParams resize_params{};
+  resize_params.align_corners = node->align_corners();
+  // TODO Read half_pixel_centers from the node once CircleResizeNearestNeighbor
+  // provides it. The node currently has no such attribute, so the kernel is
+  // always built with the value false.
+  resize_params.half_pixel_centers = false;
+
+  // Look up the two operand tensors, then the tensor that receives the result.
+  const Tensor *const src = helper.getInputTensor(node->input());
+  const Tensor *const new_size = helper.getInputTensor(node->size());
+  Tensor *const dst = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::ResizeNearestNeighbor>(src, new_size, dst, resize_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/ReverseV2.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/ReverseV2.cpp
new file mode 100644
index 000000000..c1a7f5350
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/ReverseV2.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/ReverseV2.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleReverseV2(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleReverseV2 *>(circle_node);
+  assert(node->arity() == 2);
+
+  // Operands come from the node's tensor() and axis() inputs, in that order.
+  const Tensor *const src = helper.getInputTensor(node->tensor());
+  const Tensor *const axis_tensor = helper.getInputTensor(node->axis());
+  Tensor *const dst = helper.getOutputTensor(node);
+  return std::make_unique<kernels::ReverseV2>(src, axis_tensor, dst);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Rsqrt.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Rsqrt.cpp
new file mode 100644
index 000000000..0714a5dba
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Rsqrt.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Rsqrt.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleRsqrt(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleRsqrt *>(circle_node);
+  assert(node->arity() == 1);
+
+  // The single operand is the node's x() input.
+  const Tensor *const src = helper.getInputTensor(node->x());
+  Tensor *const dst = helper.getOutputTensor(node);
+  return std::make_unique<kernels::Rsqrt>(src, dst);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/SVDF.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/SVDF.cpp
new file mode 100644
index 000000000..d172ef438
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/SVDF.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/SVDF.h"
+
+namespace luci_interpreter
+{
+
+// Builds the SVDF kernel for the given node. Besides the operand and output tensors,
+// the kernel constructor takes seven scratchpad tensors, which are registered here.
+std::unique_ptr<Kernel> build_kernel_CircleSVDF(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleSVDF *>(circle_node);
+  assert(node->arity() == 5);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *feature = helper.getInputTensor(node->weight_feature());
+  const Tensor *time = helper.getInputTensor(node->weight_time());
+  const Tensor *bias = helper.getOptionalInputTensor(node->bias());
+  const Tensor *input_activation_state = helper.getInputTensor(node->input_activation_state());
+  Tensor *output = helper.getOutputTensor(node);
+
+  // Adds one scratchpad tensor of the given element type to the runtime graph
+  // looked up for this node's graph. Every scratchpad gets an empty shape, default
+  // quantization, an empty name and no data buffer, and is marked non-observable.
+  const auto add_scratchpad = [&helper, node](DataType type) -> Tensor * {
+    auto scratchpad = std::make_unique<Tensor>(type, Shape({}), AffineQuantization{}, "");
+    scratchpad->set_observable(false);
+    scratchpad->set_data_buffer(nullptr);
+    return helper.getRuntimeGraph(node->graph())->addTensor(std::move(scratchpad));
+  };
+
+  // tmp follows the element type of the activation state tensor.
+  Tensor *tmp = add_scratchpad(input_activation_state->element_type());
+
+  // tmp_1 is S32 when the input is S8 and FLOAT32 otherwise.
+  DataType data_type = input->element_type() == DataType::S8 ? DataType::S32 : DataType::FLOAT32;
+  Tensor *tmp_1 = add_scratchpad(data_type);
+
+  // tmp_2 takes the feature weights' type when the input is not S8 but the
+  // feature weights are S8 or U8; otherwise it has the same type as tmp_1.
+  if (data_type == DataType::FLOAT32 &&
+      (feature->element_type() == DataType::S8 || feature->element_type() == DataType::U8))
+  {
+    data_type = feature->element_type();
+  }
+  Tensor *tmp_2 = add_scratchpad(data_type);
+
+  // The remaining scratchpads are always FLOAT32.
+  Tensor *tmp_3 = add_scratchpad(DataType::FLOAT32);
+  Tensor *tmp_4 = add_scratchpad(DataType::FLOAT32);
+  Tensor *tmp_5 = add_scratchpad(DataType::FLOAT32);
+  Tensor *tmp_6 = add_scratchpad(DataType::FLOAT32);
+
+  SVDFParams params{};
+  params.activation = node->fusedActivationFunction();
+  params.svdf_rank = node->svdf_rank();
+  params.asymmetric_quantize_inputs = node->asymmetric_quantize_inputs();
+
+  return std::make_unique<kernels::SVDF>(input, feature, time, bias, input_activation_state, output,
+                                         tmp, tmp_1, tmp_2, tmp_3, tmp_4, tmp_5, tmp_6, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Shape.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Shape.cpp
new file mode 100644
index 000000000..d1edbc794
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Shape.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Shape.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleShape(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleShape *>(circle_node);
+  assert(node->arity() == 1);
+
+  // Carry the node's out_type attribute over to the kernel parameters.
+  ShapeParams kernel_params{};
+  kernel_params.out_type = node->out_type();
+
+  const auto src = helper.getInputTensor(node->input());
+  auto dst = helper.getOutputTensor(node);
+  return std::make_unique<kernels::ShapeKernel>(src, dst, kernel_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Slice.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Slice.cpp
new file mode 100644
index 000000000..60ac6417c
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Slice.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Slice.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSlice(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleSlice *>(circle_node);
+  assert(node->arity() == 3);
+
+  // Look up the three operand tensors, then the tensor that receives the result.
+  const Tensor *const src = helper.getInputTensor(node->input());
+  const Tensor *const begin_tensor = helper.getInputTensor(node->begin());
+  const Tensor *const size_tensor = helper.getInputTensor(node->size());
+  Tensor *const dst = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Slice>(src, begin_tensor, size_tensor, dst);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Softmax.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Softmax.cpp
new file mode 100644
index 000000000..f41f63f6f
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Softmax.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Softmax.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSoftmax(const luci::CircleNode *circle_node,
+                                                   KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleSoftmax *>(circle_node);
+  assert(node->arity() == 1);
+
+  // Carry the node's beta attribute over to the kernel parameters.
+  SoftmaxParams kernel_params{};
+  kernel_params.beta = node->beta();
+
+  const Tensor *const logits = helper.getInputTensor(node->logits());
+  Tensor *const dst = helper.getOutputTensor(node);
+  return std::make_unique<kernels::Softmax>(logits, dst, kernel_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp
new file mode 100644
index 000000000..b6e6cf516
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/SpaceToBatchND.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSpaceToBatchND(const luci::CircleNode *circle_node,
+                                                          KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleSpaceToBatchND *>(circle_node);
+  assert(node->arity() == 3);
+
+  // Look up the three operand tensors, then the tensor that receives the result.
+  const Tensor *const src = helper.getInputTensor(node->input());
+  const Tensor *const block_dims = helper.getInputTensor(node->block_shape());
+  const Tensor *const pad_amounts = helper.getInputTensor(node->paddings());
+  Tensor *const dst = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::SpaceToBatchND>(src, block_dims, pad_amounts, dst);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp
new file mode 100644
index 000000000..63fdb95ec
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/SpaceToDepth.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSpaceToDepth(const luci::CircleNode *circle_node,
+                                                        KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleSpaceToDepth *>(circle_node);
+  assert(node->arity() == 1);
+
+  // Carry the node's block_size attribute over to the kernel parameters.
+  SpaceToDepthParams kernel_params{};
+  kernel_params.block_size = node->block_size();
+
+  const Tensor *const src = helper.getInputTensor(node->input());
+  Tensor *const dst = helper.getOutputTensor(node);
+  return std::make_unique<kernels::SpaceToDepth>(src, dst, kernel_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Split.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Split.cpp
new file mode 100644
index 000000000..3f6d4a7df
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Split.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Split.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSplit(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleSplit *>(circle_node);
+  auto output_nodes = collectOutputNodes<luci::CircleSplitOut>(node);
+  assert(node->arity() == 2);
+  assert(output_nodes.size() == static_cast<size_t>(node->num_split()));
+
+  // NOTE num_split is only checked by the assert above; it is not passed to the kernel.
+  const Tensor *const split_axis = helper.getInputTensor(node->split_dim());
+  const Tensor *const src = helper.getInputTensor(node->input());
+  std::vector<Tensor *> dst_tensors = helper.getOutputTensors(output_nodes);
+
+  return std::make_unique<kernels::Split>(split_axis, src, std::move(dst_tensors));
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/SplitV.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/SplitV.cpp
new file mode 100644
index 000000000..0788822ca
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/SplitV.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/SplitV.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSplitV(const luci::CircleNode *circle_node,
+                                                  KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleSplitV *>(circle_node);
+  auto output_nodes = collectOutputNodes<luci::CircleSplitVOut>(node);
+  assert(node->arity() == 3);
+  assert(output_nodes.size() == static_cast<size_t>(node->num_split()));
+
+  // NOTE num_split is only checked by the assert above; it is not passed to the kernel.
+  const Tensor *const src = helper.getInputTensor(node->input());
+  const Tensor *const split_sizes = helper.getInputTensor(node->size_splits());
+  const Tensor *const split_axis = helper.getInputTensor(node->split_dim());
+  std::vector<Tensor *> dst_tensors = helper.getOutputTensors(output_nodes);
+
+  return std::make_unique<kernels::SplitV>(src, split_sizes, split_axis, std::move(dst_tensors));
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Sqrt.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Sqrt.cpp
new file mode 100644
index 000000000..b9843fe0b
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Sqrt.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Sqrt.h"
+
+namespace luci_interpreter
+{
+
+// Builds the Sqrt kernel for a CircleSqrt node (one input tensor, one output tensor).
+std::unique_ptr<Kernel> build_kernel_CircleSqrt(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *sqrt_node = loco::must_cast<const luci::CircleSqrt *>(circle_node);
+  assert(sqrt_node->arity() == 1);
+
+  const Tensor *operand = helper.getInputTensor(sqrt_node->x());
+  Tensor *result = helper.getOutputTensor(sqrt_node);
+  return std::make_unique<kernels::Sqrt>(operand, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Square.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Square.cpp
new file mode 100644
index 000000000..0ad7c1772
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Square.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Square.h"
+
+namespace luci_interpreter
+{
+
+// Builds the Square kernel for a CircleSquare node (one input tensor, one output tensor).
+std::unique_ptr<Kernel> build_kernel_CircleSquare(const luci::CircleNode *circle_node,
+                                                  KernelBuilderHelper &helper)
+{
+  const auto *square_node = loco::must_cast<const luci::CircleSquare *>(circle_node);
+  assert(square_node->arity() == 1);
+
+  const Tensor *operand = helper.getInputTensor(square_node->x());
+  Tensor *result = helper.getOutputTensor(square_node);
+  return std::make_unique<kernels::Square>(operand, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/SquaredDifference.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/SquaredDifference.cpp
new file mode 100644
index 000000000..e4c6fd851
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/SquaredDifference.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/SquaredDifference.h"
+
+namespace luci_interpreter
+{
+
+// Builds the SquaredDifference kernel from the node's x and y operands.
+std::unique_ptr<Kernel> build_kernel_CircleSquaredDifference(const luci::CircleNode *circle_node,
+                                                             KernelBuilderHelper &helper)
+{
+  const auto *sqdiff_node = loco::must_cast<const luci::CircleSquaredDifference *>(circle_node);
+  assert(sqdiff_node->arity() == 2);
+
+  const Tensor *lhs = helper.getInputTensor(sqdiff_node->x());
+  const Tensor *rhs = helper.getInputTensor(sqdiff_node->y());
+  Tensor *result = helper.getOutputTensor(sqdiff_node);
+  return std::make_unique<kernels::SquaredDifference>(lhs, rhs, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Squeeze.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Squeeze.cpp
new file mode 100644
index 000000000..6885f8077
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Squeeze.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Squeeze.h"
+
+namespace luci_interpreter
+{
+
+// Builds the Squeeze kernel, copying the node's squeeze_dims into the kernel parameters.
+std::unique_ptr<Kernel> build_kernel_CircleSqueeze(const luci::CircleNode *circle_node,
+                                                   KernelBuilderHelper &helper)
+{
+  const auto *squeeze_node = loco::must_cast<const luci::CircleSqueeze *>(circle_node);
+  assert(squeeze_node->arity() == 1);
+
+  const Tensor *source = helper.getInputTensor(squeeze_node->input());
+  Tensor *result = helper.getOutputTensor(squeeze_node);
+
+  SqueezeParams squeeze_params{};
+  squeeze_params.squeeze_dims = squeeze_node->squeeze_dims();
+  return std::make_unique<kernels::Squeeze>(source, result, squeeze_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/StridedSlice.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/StridedSlice.cpp
new file mode 100644
index 000000000..359b4e3e9
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/StridedSlice.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/StridedSlice.h"
+
+namespace luci_interpreter
+{
+
+// Builds the StridedSlice kernel from the four node inputs and the five mask attributes.
+std::unique_ptr<Kernel> build_kernel_CircleStridedSlice(const luci::CircleNode *circle_node,
+                                                        KernelBuilderHelper &helper)
+{
+  const auto *slice_node = loco::must_cast<const luci::CircleStridedSlice *>(circle_node);
+  assert(slice_node->arity() == 4);
+
+  const Tensor *data = helper.getInputTensor(slice_node->input());
+  const Tensor *first = helper.getInputTensor(slice_node->begin());
+  const Tensor *last = helper.getInputTensor(slice_node->end());
+  const Tensor *step = helper.getInputTensor(slice_node->strides());
+  Tensor *result = helper.getOutputTensor(slice_node);
+
+  StridedSliceParams masks{};
+  masks.begin_mask = slice_node->begin_mask();
+  masks.ellipsis_mask = slice_node->ellipsis_mask();
+  masks.end_mask = slice_node->end_mask();
+  masks.new_axis_mask = slice_node->new_axis_mask();
+  masks.shrink_axis_mask = slice_node->shrink_axis_mask();
+
+  return std::make_unique<kernels::StridedSlice>(data, first, last, step, result, masks);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Sub.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Sub.cpp
new file mode 100644
index 000000000..a6252cb53
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Sub.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Sub.h"
+
+namespace luci_interpreter
+{
+
+// Builds the Sub kernel from the node's x and y operands and its fused activation function.
+std::unique_ptr<Kernel> build_kernel_CircleSub(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *sub_node = loco::must_cast<const luci::CircleSub *>(circle_node);
+  assert(sub_node->arity() == 2);
+
+  const Tensor *lhs = helper.getInputTensor(sub_node->x());
+  const Tensor *rhs = helper.getInputTensor(sub_node->y());
+  Tensor *result = helper.getOutputTensor(sub_node);
+
+  SubParams sub_params{};
+  sub_params.activation = sub_node->fusedActivationFunction();
+  return std::make_unique<kernels::Sub>(lhs, rhs, result, sub_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Tanh.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Tanh.cpp
new file mode 100644
index 000000000..a58ef60a8
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Tanh.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Tanh.h"
+
+namespace luci_interpreter
+{
+
+// Builds the Tanh kernel for a CircleTanh node (one input tensor, one output tensor).
+std::unique_ptr<Kernel> build_kernel_CircleTanh(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *tanh_node = loco::must_cast<const luci::CircleTanh *>(circle_node);
+  assert(tanh_node->arity() == 1);
+
+  const Tensor *operand = helper.getInputTensor(tanh_node->x());
+  Tensor *result = helper.getOutputTensor(tanh_node);
+  return std::make_unique<kernels::Tanh>(operand, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Transpose.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Transpose.cpp
new file mode 100644
index 000000000..ea17d8311
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Transpose.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Transpose.h"
+
+namespace luci_interpreter
+{
+
+// Builds the Transpose kernel from the node's data input 'a' and its 'perm' input.
+std::unique_ptr<Kernel> build_kernel_CircleTranspose(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{
+  const auto *transpose_node = loco::must_cast<const luci::CircleTranspose *>(circle_node);
+  assert(transpose_node->arity() == 2);
+
+  const Tensor *source = helper.getInputTensor(transpose_node->a());
+  const Tensor *permutation = helper.getInputTensor(transpose_node->perm());
+  Tensor *result = helper.getOutputTensor(transpose_node);
+  return std::make_unique<kernels::Transpose>(source, permutation, result);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/TransposeConv.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/TransposeConv.cpp
new file mode 100644
index 000000000..d773e301e
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/TransposeConv.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/TransposeConv.h"
+
+namespace luci_interpreter
+{
+// Builds the TransposeConv kernel, including a scratch tensor added to the runtime graph.
+std::unique_ptr<Kernel> build_kernel_CircleTransposeConv(const luci::CircleNode *circle_node,
+                                                         KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleTransposeConv *>(circle_node);
+  assert(node->arity() == 4);
+  // 'bias' is looked up with getOptionalInputTensor; the other three inputs with getInputTensor.
+  const Tensor *input_sizes = helper.getInputTensor(node->inputSizes());
+  const Tensor *filter = helper.getInputTensor(node->filter());
+  const Tensor *out_backprop = helper.getInputTensor(node->outBackprop());
+  const Tensor *bias = helper.getOptionalInputTensor(node->bias());
+
+  Tensor *output = helper.getOutputTensor(node);
+  // Scratch type is S64 when the tensor looked up for 'node' itself is S16, otherwise S32.
+  DataType scratch_data_type =
+    helper.getInputTensor(node)->element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
+  // The scratch tensor has an empty shape and name, is non-observable and has a null buffer.
+  auto scratch_tensor =
+    std::make_unique<Tensor>(scratch_data_type, Shape({}), AffineQuantization{}, "");
+  scratch_tensor->set_observable(false);
+  scratch_tensor->set_data_buffer(nullptr);
+  Tensor *tmp = helper.getRuntimeGraph(node->graph())->addTensor(std::move(scratch_tensor));
+
+  TransposeConvParams params{};
+  params.padding = node->padding();
+  params.stride_height = node->stride()->h();
+  params.stride_width = node->stride()->w();
+
+  return std::make_unique<kernels::TransposeConv>(input_sizes, filter, out_backprop, bias, output,
+                                                  tmp, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/Unpack.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Unpack.cpp
new file mode 100644
index 000000000..a1c0d323a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/Unpack.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Unpack.h"
+
+namespace luci_interpreter
+{
+
+// Builds the Unpack kernel from a CircleUnpack node and its CircleUnpackOut output nodes.
+std::unique_ptr<Kernel> build_kernel_CircleUnpack(const luci::CircleNode *circle_node,
+                                                  KernelBuilderHelper &helper)
+{
+  const auto *unpack_node = loco::must_cast<const luci::CircleUnpack *>(circle_node);
+  auto out_nodes = collectOutputNodes<luci::CircleUnpackOut>(unpack_node);
+  assert(unpack_node->arity() == 1);
+  assert(out_nodes.size() == static_cast<size_t>(unpack_node->num()));
+
+  const Tensor *packed = helper.getInputTensor(unpack_node->value());
+  std::vector<Tensor *> pieces = helper.getOutputTensors(out_nodes);
+
+  UnpackParams unpack_params{};
+  unpack_params.axis = unpack_node->axis();
+  // num() is only checked by the assertion above; it is not forwarded to the kernel.
+  return std::make_unique<kernels::Unpack>(packed, std::move(pieces), unpack_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/luci-interpreter/src/loader/nodes/While.cpp b/compiler/luci-micro/luci-interpreter/src/loader/nodes/While.cpp
new file mode 100644
index 000000000..8fde6ec8a
--- /dev/null
+++ b/compiler/luci-micro/luci-interpreter/src/loader/nodes/While.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/While.h"
+
+namespace luci_interpreter
+{
+
+// Builds the While kernel from all node inputs, CircleWhileOut outputs and cond/body graphs.
+std::unique_ptr<Kernel> build_kernel_CircleWhile(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *while_node = loco::must_cast<const luci::CircleWhile *>(circle_node);
+  auto out_nodes = collectOutputNodes<luci::CircleWhileOut>(while_node);
+  assert(while_node->arity() == while_node->input_count());
+  assert(out_nodes.size() == static_cast<size_t>(while_node->output_count()));
+
+  std::vector<const Tensor *> ins;
+  ins.reserve(while_node->input_count());
+  for (uint32_t idx = 0; idx < while_node->input_count(); ++idx)
+  {
+    ins.push_back(helper.getInputTensor(while_node->input(idx)));
+  }
+  std::vector<Tensor *> outs = helper.getOutputTensors(out_nodes);
+
+  RuntimeGraph *cond = helper.getRuntimeGraph(while_node->cond_graph());
+  RuntimeGraph *body = helper.getRuntimeGraph(while_node->body_graph());
+
+  return std::make_unique<kernels::While>(std::move(ins), std::move(outs), cond, body);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/standalone/CMakeLists.txt b/compiler/luci-micro/standalone/CMakeLists.txt
index 7953359ad..d3048264d 100644
--- a/compiler/luci-micro/standalone/CMakeLists.txt
+++ b/compiler/luci-micro/standalone/CMakeLists.txt
@@ -7,6 +7,9 @@ set(BUILD_WHITELIST "dummy")
 add_subdirectory(${NNAS_ROOT}/infra/nncc ${CMAKE_CURRENT_BINARY_DIR}/nncc)
 
 set(ONE_COMPILER_SRC_DIR "${NNAS_PROJECT_SOURCE_DIR}/compiler")
+nnas_find_package(FlatBuffersSource EXACT 2.0 QUIET)
+
+include_directories(${FlatBuffersSource_DIR}/include)
 
 add_subdirectory(${ONE_COMPILER_SRC_DIR}/loco ${CMAKE_CURRENT_BINARY_DIR}/loco)
 add_subdirectory(${ONE_COMPILER_SRC_DIR}/angkor ${CMAKE_CURRENT_BINARY_DIR}/angkor)
@@ -14,7 +17,21 @@ add_subdirectory(${ONE_COMPILER_SRC_DIR}/oops ${CMAKE_CURRENT_BINARY_DIR}/oops)
 add_subdirectory(${ONE_COMPILER_SRC_DIR}/pepper-str ${CMAKE_CURRENT_BINARY_DIR}/pepper-str)
 add_subdirectory(${ONE_COMPILER_SRC_DIR}/logo ${CMAKE_CURRENT_BINARY_DIR}/logo)
 add_subdirectory(${ONE_COMPILER_SRC_DIR}/logo-core ${CMAKE_CURRENT_BINARY_DIR}/logo-core)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/hermes-std ${CMAKE_CURRENT_BINARY_DIR}/hermes-std)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/hermes ${CMAKE_CURRENT_BINARY_DIR}/hermes)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/pepper-strcast ${CMAKE_CURRENT_BINARY_DIR}/pepper-strcast)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/foder ${CMAKE_CURRENT_BINARY_DIR}/foder)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/mio-circle04 ${CMAKE_CURRENT_BINARY_DIR}/mio-circle04)
+
 add_subdirectory(${ONE_COMPILER_SRC_DIR}/locomotiv ${CMAKE_CURRENT_BINARY_DIR}/locomotiv)
 add_subdirectory(${ONE_COMPILER_SRC_DIR}/luci/lang ${CMAKE_CURRENT_BINARY_DIR}/luci/lang)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/luci/import ${CMAKE_CURRENT_BINARY_DIR}/luci/import)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/luci/profile ${CMAKE_CURRENT_BINARY_DIR}/luci/profile)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/luci/env ${CMAKE_CURRENT_BINARY_DIR}/luci/env)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/luci/plan ${CMAKE_CURRENT_BINARY_DIR}/luci/plan)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/luci/log ${CMAKE_CURRENT_BINARY_DIR}/luci/log)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/luci/logex ${CMAKE_CURRENT_BINARY_DIR}/luci/logex)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/locop ${CMAKE_CURRENT_BINARY_DIR}/locop)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/pp ${CMAKE_CURRENT_BINARY_DIR}/pp)
 
-add_subdirectory(${ONE_COMPILER_SRC_DIR}/luci-interpreter ${CMAKE_CURRENT_BINARY_DIR}/luci-interpreter)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/luci-micro/luci-interpreter ${CMAKE_CURRENT_BINARY_DIR}/luci-interpreter)
diff --git a/compiler/luci-pass-value-test/CMakeLists.txt b/compiler/luci-pass-value-test/CMakeLists.txt
index 034fe5269..3489f1eac 100644
--- a/compiler/luci-pass-value-test/CMakeLists.txt
+++ b/compiler/luci-pass-value-test/CMakeLists.txt
@@ -17,6 +17,13 @@ macro(addeval RECIPE PASS_OPTION)
   set(PASS_CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${PASS_CIRCLE_FILE}")
 
   set(DASH_PASS_OPTION "--${PASS_OPTION}")
+  foreach(MORE_OPTIONS ${ARGN})
+    list(APPEND DASH_PASS_OPTION "--${MORE_OPTIONS}")
+  endforeach()
+  # NOTE if there are two options, 'DASH_PASS_OPTION' will be like '--option_a;--option_b'
+  #      add_custom_command() will translate ';' to two arguments as '--option_a --option_b'
+  #      do not use set(DASH_PASS_OPTION "${DASH_PASS_OPTION} --${ARG}")
+  #      as this will become like '"--option_a --option_b"' which is one string argument
 
   # Generate optimized .circle
   add_custom_command(OUTPUT ${PASS_CIRCLE_OUTPUT_PATH}
diff --git a/compiler/luci-pass-value-test/test.lst b/compiler/luci-pass-value-test/test.lst
index 67476c644..cdff159e0 100644
--- a/compiler/luci-pass-value-test/test.lst
+++ b/compiler/luci-pass-value-test/test.lst
@@ -14,6 +14,8 @@ addeval(Net_Conv_Add_Mul_002 fuse_batchnorm_with_conv)
 addeval(Net_Conv_Min_Max_000 transform_min_max_to_relu6)
 addeval(Net_Conv_Min_Relu_000 transform_min_relu_to_relu6)
 addeval(Net_Conv_Relu6_000 fuse_activation_function)
+addeval(Net_Densify_Add_000 fold_densify)
+addeval(Net_Dequantize_Add_000 fold_dequantize)
 addeval(Net_DwConv_BN_000 fuse_batchnorm_with_dwconv)
 addeval(Net_DwConv_BN_001 fuse_batchnorm_with_dwconv)
 addeval(Net_Reshape_Neg_000 forward_reshape_to_unaryop)
@@ -25,10 +27,17 @@ addeval(Net_TConv_Add_002 fuse_add_with_tconv)
 addeval(Net_TConv_BN_000 fuse_batchnorm_with_tconv)
 addeval(Net_TConv_BN_001 fuse_batchnorm_with_tconv)
 addeval(Net_TConv_BN_002 fuse_batchnorm_with_tconv)
+addeval(Net_TConv_BN_003 fuse_batchnorm_with_tconv)
+addeval(Net_TConv_BN_004 fuse_batchnorm_with_tconv)
 addeval(Net_InstanceNorm_001 fuse_instnorm)
 addeval(Net_InstanceNorm_002 fuse_instnorm)
 addeval(Net_InstanceNorm_003 fuse_instnorm)
 addeval(Net_StridedSlice_StridedSlice_000 remove_unnecessary_strided_slice)
+addeval(FullyConnected_007 replace_non_const_fc_with_batch_matmul)
+
+# test for limited support for FLOAT16
+addeval(Net_Dequantize_Add_000 fold_dequantize)
+addeval(Net_Densify_Dequantize_Add_000 fold_dequantize fold_densify)
 
 # test SignatureDef, with any optimization
 #addeval(SignatureDef_MultiOut_000 fuse_instnorm)
diff --git a/compiler/luci-value-test/test.lst b/compiler/luci-value-test/test.lst
index f62b72919..932da95c5 100644
--- a/compiler/luci-value-test/test.lst
+++ b/compiler/luci-value-test/test.lst
@@ -161,6 +161,8 @@ addeval(Squeeze_001)
 addeval(StridedSlice_000)
 addeval(StridedSlice_001)
 addeval(StridedSlice_002)
+addeval(StridedSlice_003)
+addeval(StridedSlice_004)
 addeval(Sub_000)
 addeval(Sub_U8_000)
 #addeval(Sum_000)
diff --git a/compiler/luci/export/src/CircleBuiltinTypesExtractor.h b/compiler/luci/export/src/CircleBuiltinTypesExtractor.h
index 0ff21a34b..7516197c0 100644
--- a/compiler/luci/export/src/CircleBuiltinTypesExtractor.h
+++ b/compiler/luci/export/src/CircleBuiltinTypesExtractor.h
@@ -118,6 +118,10 @@ public:
     return circle::CreateCosOptions(_builder).Union();
   }
   flatbuffers::Offset<void> visit(luci::CircleCustom *) { return _no_option; }
+  flatbuffers::Offset<void> visit(luci::CircleDensify *)
+  {
+    return circle::CreateDensifyOptions(_builder).Union(); // built from _builder only
+  }
   flatbuffers::Offset<void> visit(luci::CircleDepthToSpace *node)
   {
     return circle::CreateDepthToSpaceOptions(_builder, node->block_size()).Union();
diff --git a/compiler/luci/export/src/CircleOps.lst b/compiler/luci/export/src/CircleOps.lst
index 1b6909303..8a75ef706 100644
--- a/compiler/luci/export/src/CircleOps.lst
+++ b/compiler/luci/export/src/CircleOps.lst
@@ -32,6 +32,7 @@ CIRCLE_NODE(CircleConcatenation, BuiltinOperator_CONCATENATION, BuiltinOptions_C
 CIRCLE_NODE(CircleConv2D, BuiltinOperator_CONV_2D, BuiltinOptions_Conv2DOptions)
 CIRCLE_NODE(CircleCos, BuiltinOperator_COS, BuiltinOptions_CosOptions)
 CIRCLE_NODE(CircleCustom, BuiltinOperator_CUSTOM, BuiltinOptions_NONE)
+CIRCLE_NODE(CircleDensify, BuiltinOperator_DENSIFY, BuiltinOptions_DensifyOptions)
 CIRCLE_NODE(CircleDepthToSpace, BuiltinOperator_DEPTH_TO_SPACE, BuiltinOptions_DepthToSpaceOptions)
 CIRCLE_NODE(CircleDepthwiseConv2D, BuiltinOperator_DEPTHWISE_CONV_2D, BuiltinOptions_DepthwiseConv2DOptions)
 CIRCLE_NODE(CircleDequantize, BuiltinOperator_DEQUANTIZE, BuiltinOptions_DequantizeOptions)
diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp b/compiler/luci/export/src/CircleTensorExporter.cpp
index b3bb850cc..97e81076b 100644
--- a/compiler/luci/export/src/CircleTensorExporter.cpp
+++ b/compiler/luci/export/src/CircleTensorExporter.cpp
@@ -434,6 +434,12 @@ flatbuffers::Offset<circle::Buffer> encodeOpBuffer(FlatBufferBuilder &builder, l
       break;
   }
 
+  // NOTE loco::DataType::FLOAT16 is added but we do not export this type
+  //      as backends currently don't support this type.
+  //      currently this is supported only for "Tensor(Float16) - Dequantize"
+  //      sequence so that after 'fold_dequantize' option this Tensor is
+  //      converted to FLOAT32.
+
   INTERNAL_EXN_V("Unsupported datatype", oops::to_uint32(c->dtype()));
 }
 
diff --git a/compiler/luci/import/CMakeLists.txt b/compiler/luci/import/CMakeLists.txt
index 1b2db23ae..bc0a00b34 100644
--- a/compiler/luci/import/CMakeLists.txt
+++ b/compiler/luci/import/CMakeLists.txt
@@ -18,6 +18,7 @@ target_link_libraries(luci_import PRIVATE luci_log)
 target_link_libraries(luci_import PRIVATE luci_logex)
 target_link_libraries(luci_import PRIVATE nncc_common)
 target_link_libraries(luci_import PRIVATE locop)
+target_link_libraries(luci_import PRIVATE foder)
 target_link_libraries(luci_import PRIVATE oops)
 target_link_libraries(luci_import PRIVATE mio_circle04_helper)
 install(TARGETS luci_import DESTINATION lib)
diff --git a/compiler/luci/import/include/luci/Import/Nodes.h b/compiler/luci/import/include/luci/Import/Nodes.h
index 7a5045ede..a4a6d7ce8 100644
--- a/compiler/luci/import/include/luci/Import/Nodes.h
+++ b/compiler/luci/import/include/luci/Import/Nodes.h
@@ -35,6 +35,7 @@
 #include "Nodes/CircleConv2D.h"
 #include "Nodes/CircleCos.h"
 #include "Nodes/CircleCustom.h"
+#include "Nodes/CircleDensify.h"
 #include "Nodes/CircleDepthToSpace.h"
 #include "Nodes/CircleDepthwiseConv2D.h"
 #include "Nodes/CircleDequantize.h"
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleDensify.h b/compiler/luci/import/include/luci/Import/Nodes/CircleDensify.h
new file mode 100644
index 000000000..42bdac1a4
--- /dev/null
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleDensify.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IMPORT_OP_CIRCLE_DENSIFY_H__
+#define __LUCI_IMPORT_OP_CIRCLE_DENSIFY_H__
+
+#include "luci/Import/GraphBuilder.h"
+
+namespace luci
+{
+// GraphBuilder subclass for Densify; validate() and build_node() are both declared final.
+class CircleDensifyGraphBuilder : public GraphBuilder
+{
+public:
+  bool validate(const ValidateArgs &args) const final;
+
+private:
+  CircleNode *build_node(const circle::OperatorT &op, const std::vector<CircleNode *> &inputs,
+                         loco::Graph *graph) const final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_IMPORT_OP_CIRCLE_DENSIFY_H__
diff --git a/compiler/luci/import/include/luci/ImporterEx.h b/compiler/luci/import/include/luci/ImporterEx.h
new file mode 100644
index 000000000..852d4c848
--- /dev/null
+++ b/compiler/luci/import/include/luci/ImporterEx.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IMPORTER_EX_H__
+#define __LUCI_IMPORTER_EX_H__
+
+#include "luci/IR/Module.h"
+
+#include <memory>
+#include <string>
+
+namespace luci
+{
+// Importer front end that takes a model file path instead of an in-memory model.
+class ImporterEx final
+{
+public:
+  ImporterEx() = default;
+  // importVerifyModule() is implemented in ImporterEx.cpp; it returns a Module or nullptr.
+public:
+  std::unique_ptr<Module> importVerifyModule(const std::string &input_path) const;
+};
+
+} // namespace luci
+
+#endif // __LUCI_IMPORTER_EX_H__
diff --git a/compiler/luci/import/src/GraphBuilderRegistry.cpp b/compiler/luci/import/src/GraphBuilderRegistry.cpp
index fe2d830e9..d3b52aadb 100644
--- a/compiler/luci/import/src/GraphBuilderRegistry.cpp
+++ b/compiler/luci/import/src/GraphBuilderRegistry.cpp
@@ -44,6 +44,7 @@ GraphBuilderRegistry::GraphBuilderRegistry()
   CIRCLE_NODE(CONCATENATION, CircleConcatenationGraphBuilder);                             // 2
   CIRCLE_NODE(CONV_2D, CircleConv2DGraphBuilder);                                          // 3
   CIRCLE_NODE(COS, CircleCosGraphBuilder);                                                 // 108
+  CIRCLE_NODE(DENSIFY, CircleDensifyGraphBuilder);                                         // 124
   CIRCLE_NODE(DEPTH_TO_SPACE, CircleDepthToSpaceGraphBuilder);                             // 5
   CIRCLE_NODE(DEPTHWISE_CONV_2D, CircleDepthwiseConv2DGraphBuilder);                       // 4
   CIRCLE_NODE(DEQUANTIZE, CircleDequantizeGraphBuilder);                                   // 6
@@ -160,7 +161,6 @@ GraphBuilderRegistry::GraphBuilderRegistry()
   // BuiltinOperator_DELEGATE = 51,
   // BuiltinOperator_ARG_MAX = 56,
   // BuiltinOperator_HARD_SWISH = 117,
-  // BuiltinOperator_DENSIFY = 124,
 
   // Register builders for nodes which not handles in builders registered above.
 #define CIRCLE_NODE(CLASS) add(std::make_unique<CLASS>())
diff --git a/compiler/luci/import/src/ImporterEx.cpp b/compiler/luci/import/src/ImporterEx.cpp
new file mode 100644
index 000000000..db585fd4d
--- /dev/null
+++ b/compiler/luci/import/src/ImporterEx.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Importer.h"
+#include "luci/ImporterEx.h"
+
+#include <foder/FileLoader.h>
+
+#include <cstdint>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <vector>
+
+namespace luci
+{
+
+/**
+ * @brief Load a circle file, verify its flatbuffer and import it as a Module
+ *
+ * @param input_path Path of the circle model file to read
+ * @return Result of Importer::importModule, or nullptr (after printing a message
+ *         to stderr) when loading throws std::runtime_error, flatbuffer
+ *         verification fails or the model root is null
+ */
+std::unique_ptr<Module> ImporterEx::importVerifyModule(const std::string &input_path) const
+{
+  foder::FileLoader file_loader{input_path};
+  std::vector<char> model_data;
+
+  try
+  {
+    model_data = file_loader.load();
+  }
+  catch (const std::runtime_error &err)
+  {
+    std::cerr << err.what() << std::endl;
+    return nullptr;
+  }
+
+  // Verify the raw bytes before they are interpreted as a circle::Model
+  const auto *buffer = reinterpret_cast<const uint8_t *>(model_data.data());
+  flatbuffers::Verifier verifier{buffer, model_data.size()};
+  if (!circle::VerifyModelBuffer(verifier))
+  {
+    std::cerr << "ERROR: Invalid input file '" << input_path << "'" << std::endl;
+    return nullptr;
+  }
+
+  const circle::Model *circle_model = circle::GetModel(model_data.data());
+  if (circle_model == nullptr)
+  {
+    std::cerr << "ERROR: Failed to load circle '" << input_path << "'" << std::endl;
+    return nullptr;
+  }
+
+  Importer importer;
+  return importer.importModule(circle_model);
+}
+
+} // namespace luci
diff --git a/compiler/luci/import/src/Nodes/CircleConst.cpp b/compiler/luci/import/src/Nodes/CircleConst.cpp
index a4f190dd9..88f2ae3d0 100644
--- a/compiler/luci/import/src/Nodes/CircleConst.cpp
+++ b/compiler/luci/import/src/Nodes/CircleConst.cpp
@@ -166,6 +166,10 @@ CircleNode *CircleConstNodeBuilder::build(TensorIndex tensor_index,
         copy_data<loco::DataType::FLOAT32>(buffer, num_elements, const_node);
         break;
 
+      case loco::DataType::FLOAT16:
+        copy_data<loco::DataType::FLOAT16>(buffer, num_elements, const_node);
+        break;
+
       case loco::DataType::U8:
         copy_data<loco::DataType::U8>(buffer, num_elements, const_node);
         break;
diff --git a/compiler/luci/import/src/Nodes/CircleDensify.cpp b/compiler/luci/import/src/Nodes/CircleDensify.cpp
new file mode 100644
index 000000000..0a4b2186f
--- /dev/null
+++ b/compiler/luci/import/src/Nodes/CircleDensify.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Import/Nodes/CircleDensify.h"
+
+#include <luci/IR/Nodes/CircleDensify.h>
+
+#include <loco.h>
+
+namespace luci
+{
+
+bool CircleDensifyGraphBuilder::validate(const ValidateArgs &args) const
+{
+  return GraphBuilder::validate(args, 1); // 1: presumably the expected input count -- confirm
+}
+
+CircleNode *CircleDensifyGraphBuilder::build_node(const circle::OperatorT &,
+                                                  const std::vector<CircleNode *> &inputs,
+                                                  loco::Graph *graph) const
+{
+  // Densify carries no operator options, so only the input has to be wired
+  auto *densify = graph->nodes()->create<CircleDensify>();
+
+  densify->input(inputs.at(0));
+
+  return densify;
+}
+
+} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/CircleNodes.h b/compiler/luci/lang/include/luci/IR/CircleNodes.h
index d89ea03cc..901f1cbca 100644
--- a/compiler/luci/lang/include/luci/IR/CircleNodes.h
+++ b/compiler/luci/lang/include/luci/IR/CircleNodes.h
@@ -32,6 +32,7 @@
 #include "Nodes/CircleConv2D.h"
 #include "Nodes/CircleCos.h"
 #include "Nodes/CircleCustom.h"
+#include "Nodes/CircleDensify.h"
 #include "Nodes/CircleDepthToSpace.h"
 #include "Nodes/CircleDepthwiseConv2D.h"
 #include "Nodes/CircleDequantize.h"
diff --git a/compiler/luci/lang/include/luci/IR/CircleNodes.lst b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
index 1472008df..f227a03f5 100644
--- a/compiler/luci/lang/include/luci/IR/CircleNodes.lst
+++ b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
@@ -30,6 +30,7 @@ CIRCLE_NODE(CONCATENATION, CircleConcatenation)
 CIRCLE_NODE(CONV_2D, CircleConv2D)
 CIRCLE_NODE(COS, CircleCos)
 CIRCLE_NODE(CUSTOM, CircleCustom)
+CIRCLE_NODE(DENSIFY, CircleDensify)
 CIRCLE_NODE(DEPTH_TO_SPACE, CircleDepthToSpace)
 CIRCLE_NODE(DEPTHWISE_CONV_2D, CircleDepthwiseConv2D)
 CIRCLE_NODE(DEQUANTIZE, CircleDequantize)
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleDensify.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleDensify.h
new file mode 100644
index 000000000..7acad0341
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleDensify.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_CIRCLE_DENSIFY_H__
+#define __LUCI_IR_CIRCLE_DENSIFY_H__
+
+#include "luci/IR/CircleNodeDecl.h"
+#include "luci/IR/CircleOpcode.h"
+
+#include "luci/IR/CircleNodeMixins.h"
+
+namespace luci
+{
+
+/**
+ * @brief DENSIFY in Circle
+ */
+class CircleDensify final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::DENSIFY>>
+{
+public:
+  loco::Node *input(void) const { return at(0)->node(); } // node attached to argument slot 0
+  void input(loco::Node *node) { at(0)->node(node); }     // attach `node` to argument slot 0
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_CIRCLE_DENSIFY_H__
diff --git a/compiler/luci/lang/src/Nodes/CircleConst.cpp b/compiler/luci/lang/src/Nodes/CircleConst.cpp
index c2d82c8a2..a4854ec59 100644
--- a/compiler/luci/lang/src/Nodes/CircleConst.cpp
+++ b/compiler/luci/lang/src/Nodes/CircleConst.cpp
@@ -77,6 +77,7 @@ INSTANTIATE(loco::DataType::S8);
 INSTANTIATE(loco::DataType::FLOAT32);
 INSTANTIATE(loco::DataType::U8);
 INSTANTIATE(loco::DataType::BOOL);
+INSTANTIATE(loco::DataType::FLOAT16);
 
 #undef INSTANTIATE
 
diff --git a/compiler/luci/lang/src/Nodes/CircleDensify.test.cpp b/compiler/luci/lang/src/Nodes/CircleDensify.test.cpp
new file mode 100644
index 000000000..ae83784a5
--- /dev/null
+++ b/compiler/luci/lang/src/Nodes/CircleDensify.test.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/IR/Nodes/CircleDensify.h"
+
+#include "luci/IR/CircleDialect.h"
+#include "luci/IR/CircleNodeVisitor.h"
+
+#include <gtest/gtest.h>
+
+TEST(CircleDensifyTest, constructor)
+{
+  luci::CircleDensify fresh;
+  // a default-constructed node reports the Circle dialect and the DENSIFY opcode
+  ASSERT_EQ(luci::CircleDialect::get(), fresh.dialect());
+  ASSERT_EQ(luci::CircleOpcode::DENSIFY, fresh.opcode());
+  // and has nothing connected yet
+  ASSERT_EQ(nullptr, fresh.input());
+}
+
+TEST(CircleDensifyTest, input_NEG)
+{
+  luci::CircleDensify target;
+  luci::CircleDensify source;
+  // connecting a node must be observable through the getter
+  target.input(&source);
+  ASSERT_NE(nullptr, target.input());
+  // resetting to nullptr must disconnect it again
+  target.input(nullptr);
+  ASSERT_EQ(nullptr, target.input());
+}
+
+TEST(CircleDensifyTest, arity_NEG)
+{
+  luci::CircleDensify unary;
+  // argument slot 0 exists, slot 1 is out of range
+  ASSERT_NO_THROW(unary.arg(0));
+  ASSERT_THROW(unary.arg(1), std::out_of_range);
+}
+
+TEST(CircleDensifyTest, visit_mutable_NEG)
+{
+  // a mutable visitor that overrides nothing is expected to throw on accept()
+  struct EmptyVisitor final : public luci::CircleNodeMutableVisitor<void>
+  {
+  };
+  luci::CircleDensify visited;
+  EmptyVisitor visitor;
+
+  ASSERT_THROW(visited.accept(&visitor), std::exception);
+}
+
+TEST(CircleDensifyTest, visit_NEG)
+{
+  // a const visitor that overrides nothing is expected to throw on accept()
+  struct EmptyVisitor final : public luci::CircleNodeVisitor<void>
+  {
+  };
+  luci::CircleDensify visited;
+  EmptyVisitor visitor;
+
+  ASSERT_THROW(visited.accept(&visitor), std::exception);
+}
diff --git a/compiler/luci/logex/src/CircleNodeSummaryBuilder.cpp b/compiler/luci/logex/src/CircleNodeSummaryBuilder.cpp
index eff0830b4..8409f250e 100644
--- a/compiler/luci/logex/src/CircleNodeSummaryBuilder.cpp
+++ b/compiler/luci/logex/src/CircleNodeSummaryBuilder.cpp
@@ -137,6 +137,7 @@ CircleNodeSummaryBuilder::create_builder(const luci::CircleNode *node)
     CIRCLE_NODE(CONV_2D, CircleConv2DSummaryBuilder)
     CIRCLE_NODE(COS, CircleCosSummaryBuilder)
     CIRCLE_NODE(CUSTOM, CircleCustomSummaryBuilder)
+    CIRCLE_NODE(DENSIFY, CircleDensifySummaryBuilder)
     CIRCLE_NODE(DEPTH_TO_SPACE, CircleDepthToSpaceSummaryBuilder)
     CIRCLE_NODE(DEPTHWISE_CONV_2D, CircleDepthwiseConv2DSummaryBuilder)
     CIRCLE_NODE(DEQUANTIZE, CircleDequantizeSummaryBuilder)
diff --git a/compiler/luci/logex/src/CircleNodeSummaryBuilders.cpp b/compiler/luci/logex/src/CircleNodeSummaryBuilders.cpp
index 6df9270e3..48e4579ea 100644
--- a/compiler/luci/logex/src/CircleNodeSummaryBuilders.cpp
+++ b/compiler/luci/logex/src/CircleNodeSummaryBuilders.cpp
@@ -374,6 +374,22 @@ void CircleConcatenationSummaryBuilder::build_attributes(const luci::CircleNode
   s.args().append("fused_activation_function", to_str(concat->fusedActivationFunction()));
 }
 
+void CircleConstSummaryBuilder::build_attributes(const luci::CircleNode *node,
+                                                 locop::NodeSummary &s)
+{
+  // Summarize a constant by its dtype, rank and space-separated dimension list
+  const auto *const_node = loco::must_cast<const luci::CircleConst *>(node);
+  const auto rank = const_node->rank();
+
+  s.args().append("dtype", to_str(const_node->dtype()));
+  s.args().append("rank", std::to_string(rank));
+
+  std::string dims;
+  for (uint32_t axis = 0; axis < rank; ++axis)
+    dims += (axis == 0 ? "" : " ") + std::to_string(const_node->dim(axis).value());
+  s.args().append("shape", "[" + dims + "]");
+}
+
 void CircleConstSummaryBuilder::update_status(locop::NodeSummary &s)
 {
   s.state(locop::NodeDesc::State::PartiallyKnown);
diff --git a/compiler/luci/logex/src/CircleNodeSummaryBuilders.h b/compiler/luci/logex/src/CircleNodeSummaryBuilders.h
index 6cd24b7f1..f0cac4e5e 100644
--- a/compiler/luci/logex/src/CircleNodeSummaryBuilders.h
+++ b/compiler/luci/logex/src/CircleNodeSummaryBuilders.h
@@ -167,6 +167,7 @@ private:
 class CircleConstSummaryBuilder final : public CircleNodeSummaryBuilder
 {
 private:
+  void build_attributes(const luci::CircleNode *node, locop::NodeSummary &s);
   void update_status(locop::NodeSummary &s);
 };
 
@@ -189,6 +190,10 @@ private:
   void build_attributes(const luci::CircleNode *node, locop::NodeSummary &s);
 };
 
+class CircleDensifySummaryBuilder final : public CircleNodeWithINPUTSummaryBuilder
+{ // no overrides: everything is inherited from the base builder
+};
+
 class CircleDepthToSpaceSummaryBuilder final : public CircleNodeWithINPUTSummaryBuilder
 {
 private:
diff --git a/compiler/luci/partition/src/ConnectNode.h b/compiler/luci/partition/include/luci/ConnectNode.h
index e60567c69..2d9d41d77 100644
--- a/compiler/luci/partition/src/ConnectNode.h
+++ b/compiler/luci/partition/include/luci/ConnectNode.h
@@ -77,6 +77,7 @@ public:
   void visit(const luci::CircleConv2D *) final;
   void visit(const luci::CircleCos *) final;
   void visit(const luci::CircleCustom *) final;
+  void visit(const luci::CircleDensify *) final;
   void visit(const luci::CircleDepthToSpace *) final;
   void visit(const luci::CircleDepthwiseConv2D *) final;
   void visit(const luci::CircleDequantize *) final;
diff --git a/compiler/luci/partition/src/ConnectNode.cpp b/compiler/luci/partition/src/ConnectNode.cpp
index 336be7c57..3d8c211c0 100644
--- a/compiler/luci/partition/src/ConnectNode.cpp
+++ b/compiler/luci/partition/src/ConnectNode.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include <oops/UserExn.h>
 
diff --git a/compiler/luci/partition/src/ConnectNode.test.h b/compiler/luci/partition/src/ConnectNode.test.h
index ac4878a15..18bb52a20 100644
--- a/compiler/luci/partition/src/ConnectNode.test.h
+++ b/compiler/luci/partition/src/ConnectNode.test.h
@@ -17,7 +17,7 @@
 #ifndef __CONNECT_NODE_TEST_H__
 #define __CONNECT_NODE_TEST_H__
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include <luci/Service/CircleNodeClone.h>
 #include <luci/test/TestIOGraph.h>
diff --git a/compiler/luci/partition/src/Nodes/CircleAbs.cpp b/compiler/luci/partition/src/Nodes/CircleAbs.cpp
index a3fde4c45..a7fbc37d1 100644
--- a/compiler/luci/partition/src/Nodes/CircleAbs.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleAbs.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleAbs.test.cpp b/compiler/luci/partition/src/Nodes/CircleAbs.test.cpp
index f3e721525..ac805c1af 100644
--- a/compiler/luci/partition/src/Nodes/CircleAbs.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleAbs.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleAdd.cpp b/compiler/luci/partition/src/Nodes/CircleAdd.cpp
index d393997e9..0754be626 100644
--- a/compiler/luci/partition/src/Nodes/CircleAdd.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleAdd.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleAdd.test.cpp b/compiler/luci/partition/src/Nodes/CircleAdd.test.cpp
index e457b83d2..99ae52c54 100644
--- a/compiler/luci/partition/src/Nodes/CircleAdd.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleAdd.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleAddN.cpp b/compiler/luci/partition/src/Nodes/CircleAddN.cpp
index 81e5e0949..90aaeee3a 100644
--- a/compiler/luci/partition/src/Nodes/CircleAddN.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleAddN.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleAddN.test.cpp b/compiler/luci/partition/src/Nodes/CircleAddN.test.cpp
index 5d0a7489f..37743d3a3 100644
--- a/compiler/luci/partition/src/Nodes/CircleAddN.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleAddN.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleArgMax.cpp b/compiler/luci/partition/src/Nodes/CircleArgMax.cpp
index 1409586d7..99b30d38f 100644
--- a/compiler/luci/partition/src/Nodes/CircleArgMax.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleArgMax.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleArgMax.test.cpp b/compiler/luci/partition/src/Nodes/CircleArgMax.test.cpp
index c816fbeb8..77248e07e 100644
--- a/compiler/luci/partition/src/Nodes/CircleArgMax.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleArgMax.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleArgMin.cpp b/compiler/luci/partition/src/Nodes/CircleArgMin.cpp
index 6151aa98a..1bb3d84e7 100644
--- a/compiler/luci/partition/src/Nodes/CircleArgMin.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleArgMin.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleArgMin.test.cpp b/compiler/luci/partition/src/Nodes/CircleArgMin.test.cpp
index d150be4d6..ed0cf030c 100644
--- a/compiler/luci/partition/src/Nodes/CircleArgMin.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleArgMin.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleAveragePool2D.cpp b/compiler/luci/partition/src/Nodes/CircleAveragePool2D.cpp
index 547665771..1df86c7be 100644
--- a/compiler/luci/partition/src/Nodes/CircleAveragePool2D.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleAveragePool2D.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleAveragePool2D.test.cpp b/compiler/luci/partition/src/Nodes/CircleAveragePool2D.test.cpp
index fba2be835..266120b92 100644
--- a/compiler/luci/partition/src/Nodes/CircleAveragePool2D.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleAveragePool2D.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleBCQFullyConnected.cpp b/compiler/luci/partition/src/Nodes/CircleBCQFullyConnected.cpp
index 5b1dd8543..6d50f0e31 100644
--- a/compiler/luci/partition/src/Nodes/CircleBCQFullyConnected.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleBCQFullyConnected.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleBCQFullyConnected.test.cpp b/compiler/luci/partition/src/Nodes/CircleBCQFullyConnected.test.cpp
index 3d64f4b29..2191f5b0a 100644
--- a/compiler/luci/partition/src/Nodes/CircleBCQFullyConnected.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleBCQFullyConnected.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleBCQGather.cpp b/compiler/luci/partition/src/Nodes/CircleBCQGather.cpp
index 90c4d9ef3..a9e810a27 100644
--- a/compiler/luci/partition/src/Nodes/CircleBCQGather.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleBCQGather.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleBCQGather.test.cpp b/compiler/luci/partition/src/Nodes/CircleBCQGather.test.cpp
index bbbd3f157..0324d85e0 100644
--- a/compiler/luci/partition/src/Nodes/CircleBCQGather.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleBCQGather.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleBatchMatMul.cpp b/compiler/luci/partition/src/Nodes/CircleBatchMatMul.cpp
index c3992a64e..5a459e78c 100644
--- a/compiler/luci/partition/src/Nodes/CircleBatchMatMul.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleBatchMatMul.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleBatchMatMul.test.cpp b/compiler/luci/partition/src/Nodes/CircleBatchMatMul.test.cpp
index 94336d36a..e6d26a6a1 100644
--- a/compiler/luci/partition/src/Nodes/CircleBatchMatMul.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleBatchMatMul.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleBatchToSpaceND.cpp b/compiler/luci/partition/src/Nodes/CircleBatchToSpaceND.cpp
index 2a463afb1..40b8f7052 100644
--- a/compiler/luci/partition/src/Nodes/CircleBatchToSpaceND.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleBatchToSpaceND.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleBatchToSpaceND.test.cpp b/compiler/luci/partition/src/Nodes/CircleBatchToSpaceND.test.cpp
index 544f5e127..e9cb350b8 100644
--- a/compiler/luci/partition/src/Nodes/CircleBatchToSpaceND.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleBatchToSpaceND.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleCast.cpp b/compiler/luci/partition/src/Nodes/CircleCast.cpp
index f7630cd85..e1301aa06 100644
--- a/compiler/luci/partition/src/Nodes/CircleCast.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleCast.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleCast.test.cpp b/compiler/luci/partition/src/Nodes/CircleCast.test.cpp
index 005119060..d7b679aa2 100644
--- a/compiler/luci/partition/src/Nodes/CircleCast.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleCast.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleCeil.cpp b/compiler/luci/partition/src/Nodes/CircleCeil.cpp
index a0c94033e..e7b5f5a3f 100644
--- a/compiler/luci/partition/src/Nodes/CircleCeil.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleCeil.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleCeil.test.cpp b/compiler/luci/partition/src/Nodes/CircleCeil.test.cpp
index dbd7e5390..cb0364844 100644
--- a/compiler/luci/partition/src/Nodes/CircleCeil.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleCeil.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleConcatenation.cpp b/compiler/luci/partition/src/Nodes/CircleConcatenation.cpp
index fb24d21ca..d895685f0 100644
--- a/compiler/luci/partition/src/Nodes/CircleConcatenation.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleConcatenation.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleConcatenation.test.cpp b/compiler/luci/partition/src/Nodes/CircleConcatenation.test.cpp
index 4d64b85a2..b5c05e25d 100644
--- a/compiler/luci/partition/src/Nodes/CircleConcatenation.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleConcatenation.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleConst.cpp b/compiler/luci/partition/src/Nodes/CircleConst.cpp
index 118cd8de2..b88f5ef4e 100644
--- a/compiler/luci/partition/src/Nodes/CircleConst.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleConst.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace luci
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleConv2D.cpp b/compiler/luci/partition/src/Nodes/CircleConv2D.cpp
index 46716f0ec..ca9cce18f 100644
--- a/compiler/luci/partition/src/Nodes/CircleConv2D.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleConv2D.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleConv2D.test.cpp b/compiler/luci/partition/src/Nodes/CircleConv2D.test.cpp
index 829adec9b..4596d9618 100644
--- a/compiler/luci/partition/src/Nodes/CircleConv2D.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleConv2D.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleCos.cpp b/compiler/luci/partition/src/Nodes/CircleCos.cpp
index 9dcf81e83..76b1baac3 100644
--- a/compiler/luci/partition/src/Nodes/CircleCos.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleCos.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleCos.test.cpp b/compiler/luci/partition/src/Nodes/CircleCos.test.cpp
index 6c92b93fb..ba806a3f9 100644
--- a/compiler/luci/partition/src/Nodes/CircleCos.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleCos.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleCustom.cpp b/compiler/luci/partition/src/Nodes/CircleCustom.cpp
index ac16ebe40..cc1604876 100644
--- a/compiler/luci/partition/src/Nodes/CircleCustom.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleCustom.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleCustom.test.cpp b/compiler/luci/partition/src/Nodes/CircleCustom.test.cpp
index 9f40b5220..f7fe86674 100644
--- a/compiler/luci/partition/src/Nodes/CircleCustom.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleCustom.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleCustomOut.cpp b/compiler/luci/partition/src/Nodes/CircleCustomOut.cpp
index fee1a1a8c..0d83cffaa 100644
--- a/compiler/luci/partition/src/Nodes/CircleCustomOut.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleCustomOut.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleCustomOut.test.cpp b/compiler/luci/partition/src/Nodes/CircleCustomOut.test.cpp
index 0a293970e..ddd4e93f2 100644
--- a/compiler/luci/partition/src/Nodes/CircleCustomOut.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleCustomOut.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleDensify.cpp b/compiler/luci/partition/src/Nodes/CircleDensify.cpp
new file mode 100644
index 000000000..cfb236a5d
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleDensify.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/ConnectNode.h"
+
+namespace
+{
+
+void connect(luci::ConnectNode *cn, const luci::CircleDensify *node)
+{
+  // Look up the clone of this node, then point it at the clone of the original input
+  auto *densify_clone = loco::must_cast<luci::CircleDensify *>(cn->find_clone(node));
+  auto *origin_input = loco::must_cast<luci::CircleNode *>(node->input());
+  auto *input_clone = cn->find_clone(origin_input);
+  densify_clone->input(input_clone);
+}
+
+} // namespace
+
+namespace luci
+{
+// Visitor entry point for CircleDensify: forwards to the file-local connect() helper
+void ConnectNode::visit(const luci::CircleDensify *node) { connect(this, node); }
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/Nodes/CircleDensify.test.cpp b/compiler/luci/partition/src/Nodes/CircleDensify.test.cpp
new file mode 100644
index 000000000..94076a8db
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleDensify.test.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/ConnectNode.h"
+
+#include "ConnectNode.test.h"
+
+#include <luci/Service/CircleNodeClone.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class NodeGraphlet : public NodeGraphletT<luci::CircleDensify> // graphlet specialised for Densify
+{
+public:
+  NodeGraphlet() = default;
+};
+
+class TestNodeGraph : public TestIOGraph, public NodeGraphlet
+{
+public:
+  TestNodeGraph() = default;
+
+public:
+  void init(const ShapeU32 shape)
+  {
+    TestIOGraph::init(shape, shape); // the same shape is passed for both arguments
+    NodeGraphlet::init(g());
+    // wire graph input -> Densify node -> graph output
+    node()->input(input());
+
+    output()->from(node());
+  }
+};
+
+} // namespace
+
+TEST(ConnectNodeTest, connect_Densify)
+{
+  TestNodeGraph origin_graph;
+  origin_graph.init({2, 3});
+
+  ConnectionTestHelper helper;
+  helper.prepare_inputs(&origin_graph);
+
+  auto *origin = origin_graph.node();
+  ASSERT_NO_THROW(loco::must_cast<luci::CircleDensify *>(origin));
+
+  auto *copied = luci::clone_node(origin, helper.graph_clone());
+  ASSERT_NO_THROW(loco::must_cast<luci::CircleDensify *>(copied));
+  // connecting must attach the helper's first input to the clone's only argument
+  helper.clone_connect(origin, copied);
+
+  ASSERT_EQ(1, copied->arity());
+  ASSERT_EQ(helper.inputs(0), copied->arg(0));
+}
+
+TEST(ConnectNodeTest, connect_Densify_NEG)
+{
+  TestNodeGraph origin_graph;
+  origin_graph.init({2, 3});
+
+  ConnectionTestHelper helper;
+  helper.prepare_inputs_miss(&origin_graph);
+
+  auto *origin = origin_graph.node();
+  ASSERT_NO_THROW(loco::must_cast<luci::CircleDensify *>(origin));
+
+  auto *copied = luci::clone_node(origin, helper.graph_clone());
+  ASSERT_NO_THROW(loco::must_cast<luci::CircleDensify *>(copied));
+  // inputs were prepared with prepare_inputs_miss, so connecting is expected to throw
+  EXPECT_ANY_THROW(helper.clone_connect(origin, copied));
+}
diff --git a/compiler/luci/partition/src/Nodes/CircleDepthToSpace.cpp b/compiler/luci/partition/src/Nodes/CircleDepthToSpace.cpp
index ade266e41..c044b4c42 100644
--- a/compiler/luci/partition/src/Nodes/CircleDepthToSpace.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleDepthToSpace.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleDepthToSpace.test.cpp b/compiler/luci/partition/src/Nodes/CircleDepthToSpace.test.cpp
index 997360a9b..1b61a3517 100644
--- a/compiler/luci/partition/src/Nodes/CircleDepthToSpace.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleDepthToSpace.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleDepthwiseConv2D.cpp b/compiler/luci/partition/src/Nodes/CircleDepthwiseConv2D.cpp
index 19d1d5f42..2bd9ab5ca 100644
--- a/compiler/luci/partition/src/Nodes/CircleDepthwiseConv2D.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleDepthwiseConv2D.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleDepthwiseConv2D.test.cpp b/compiler/luci/partition/src/Nodes/CircleDepthwiseConv2D.test.cpp
index 681f98bdb..02976a488 100644
--- a/compiler/luci/partition/src/Nodes/CircleDepthwiseConv2D.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleDepthwiseConv2D.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleDequantize.cpp b/compiler/luci/partition/src/Nodes/CircleDequantize.cpp
index 3a520d4e9..ac2642bc1 100644
--- a/compiler/luci/partition/src/Nodes/CircleDequantize.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleDequantize.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleDequantize.test.cpp b/compiler/luci/partition/src/Nodes/CircleDequantize.test.cpp
index 7f6006c1d..d3a43d374 100644
--- a/compiler/luci/partition/src/Nodes/CircleDequantize.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleDequantize.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleDiv.cpp b/compiler/luci/partition/src/Nodes/CircleDiv.cpp
index 480338542..8941a4196 100644
--- a/compiler/luci/partition/src/Nodes/CircleDiv.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleDiv.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleDiv.test.cpp b/compiler/luci/partition/src/Nodes/CircleDiv.test.cpp
index 226932337..7900beafc 100644
--- a/compiler/luci/partition/src/Nodes/CircleDiv.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleDiv.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleElu.cpp b/compiler/luci/partition/src/Nodes/CircleElu.cpp
index d21cd4c01..b77226574 100644
--- a/compiler/luci/partition/src/Nodes/CircleElu.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleElu.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleElu.test.cpp b/compiler/luci/partition/src/Nodes/CircleElu.test.cpp
index 94774cca8..20b205048 100644
--- a/compiler/luci/partition/src/Nodes/CircleElu.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleElu.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleEqual.cpp b/compiler/luci/partition/src/Nodes/CircleEqual.cpp
index 6a126c0e2..2dc0e759b 100644
--- a/compiler/luci/partition/src/Nodes/CircleEqual.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleEqual.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleEqual.test.cpp b/compiler/luci/partition/src/Nodes/CircleEqual.test.cpp
index 20b539199..c0d3bd915 100644
--- a/compiler/luci/partition/src/Nodes/CircleEqual.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleEqual.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleExp.cpp b/compiler/luci/partition/src/Nodes/CircleExp.cpp
index 95fb1cd67..c1da7908a 100644
--- a/compiler/luci/partition/src/Nodes/CircleExp.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleExp.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleExp.test.cpp b/compiler/luci/partition/src/Nodes/CircleExp.test.cpp
index 16d7244ab..286f205bf 100644
--- a/compiler/luci/partition/src/Nodes/CircleExp.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleExp.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleExpandDims.cpp b/compiler/luci/partition/src/Nodes/CircleExpandDims.cpp
index 6fccd6310..a6ce6495c 100644
--- a/compiler/luci/partition/src/Nodes/CircleExpandDims.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleExpandDims.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleExpandDims.test.cpp b/compiler/luci/partition/src/Nodes/CircleExpandDims.test.cpp
index 8a5156509..37af10f52 100644
--- a/compiler/luci/partition/src/Nodes/CircleExpandDims.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleExpandDims.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleFakeQuant.cpp b/compiler/luci/partition/src/Nodes/CircleFakeQuant.cpp
index 4855d80ae..5dfaee1b5 100644
--- a/compiler/luci/partition/src/Nodes/CircleFakeQuant.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleFakeQuant.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleFakeQuant.test.cpp b/compiler/luci/partition/src/Nodes/CircleFakeQuant.test.cpp
index 3821d755a..2a2ec0cff 100644
--- a/compiler/luci/partition/src/Nodes/CircleFakeQuant.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleFakeQuant.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleFill.cpp b/compiler/luci/partition/src/Nodes/CircleFill.cpp
index 06fca7b41..32688cd9b 100644
--- a/compiler/luci/partition/src/Nodes/CircleFill.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleFill.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleFill.test.cpp b/compiler/luci/partition/src/Nodes/CircleFill.test.cpp
index 97a5a348d..4b3872a80 100644
--- a/compiler/luci/partition/src/Nodes/CircleFill.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleFill.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleFloor.cpp b/compiler/luci/partition/src/Nodes/CircleFloor.cpp
index 7ad392461..f7409a221 100644
--- a/compiler/luci/partition/src/Nodes/CircleFloor.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleFloor.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleFloor.test.cpp b/compiler/luci/partition/src/Nodes/CircleFloor.test.cpp
index 1a964ea21..883d36256 100644
--- a/compiler/luci/partition/src/Nodes/CircleFloor.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleFloor.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleFloorDiv.cpp b/compiler/luci/partition/src/Nodes/CircleFloorDiv.cpp
index 3b92b00c6..57e435c23 100644
--- a/compiler/luci/partition/src/Nodes/CircleFloorDiv.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleFloorDiv.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleFloorDiv.test.cpp b/compiler/luci/partition/src/Nodes/CircleFloorDiv.test.cpp
index 3d2801566..1eb603c5d 100644
--- a/compiler/luci/partition/src/Nodes/CircleFloorDiv.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleFloorDiv.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleFloorMod.cpp b/compiler/luci/partition/src/Nodes/CircleFloorMod.cpp
index 9f868d0e5..1b942d200 100644
--- a/compiler/luci/partition/src/Nodes/CircleFloorMod.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleFloorMod.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleFloorMod.test.cpp b/compiler/luci/partition/src/Nodes/CircleFloorMod.test.cpp
index 89a09411b..680bf1680 100644
--- a/compiler/luci/partition/src/Nodes/CircleFloorMod.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleFloorMod.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleFullyConnected.cpp b/compiler/luci/partition/src/Nodes/CircleFullyConnected.cpp
index da273037a..206b47aec 100644
--- a/compiler/luci/partition/src/Nodes/CircleFullyConnected.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleFullyConnected.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleFullyConnected.test.cpp b/compiler/luci/partition/src/Nodes/CircleFullyConnected.test.cpp
index fc88204bd..39eea5571 100644
--- a/compiler/luci/partition/src/Nodes/CircleFullyConnected.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleFullyConnected.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleGather.cpp b/compiler/luci/partition/src/Nodes/CircleGather.cpp
index 0ee458394..4f059cbe4 100644
--- a/compiler/luci/partition/src/Nodes/CircleGather.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleGather.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleGather.test.cpp b/compiler/luci/partition/src/Nodes/CircleGather.test.cpp
index 7f4e08435..f427e0456 100644
--- a/compiler/luci/partition/src/Nodes/CircleGather.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleGather.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleGatherNd.cpp b/compiler/luci/partition/src/Nodes/CircleGatherNd.cpp
index 4be05ca94..6a9c3b47f 100644
--- a/compiler/luci/partition/src/Nodes/CircleGatherNd.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleGatherNd.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleGatherNd.test.cpp b/compiler/luci/partition/src/Nodes/CircleGatherNd.test.cpp
index d673698e1..0207e917d 100644
--- a/compiler/luci/partition/src/Nodes/CircleGatherNd.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleGatherNd.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleGreater.cpp b/compiler/luci/partition/src/Nodes/CircleGreater.cpp
index 7bc2a14c9..9f4b18fde 100644
--- a/compiler/luci/partition/src/Nodes/CircleGreater.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleGreater.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleGreater.test.cpp b/compiler/luci/partition/src/Nodes/CircleGreater.test.cpp
index 842370d42..61d1f5957 100644
--- a/compiler/luci/partition/src/Nodes/CircleGreater.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleGreater.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleGreaterEqual.cpp b/compiler/luci/partition/src/Nodes/CircleGreaterEqual.cpp
index 536a0aed6..76130a843 100644
--- a/compiler/luci/partition/src/Nodes/CircleGreaterEqual.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleGreaterEqual.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleGreaterEqual.test.cpp b/compiler/luci/partition/src/Nodes/CircleGreaterEqual.test.cpp
index 76dc770f8..7e4e1ef74 100644
--- a/compiler/luci/partition/src/Nodes/CircleGreaterEqual.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleGreaterEqual.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleIf.cpp b/compiler/luci/partition/src/Nodes/CircleIf.cpp
index 1672a136d..45e4ec48b 100644
--- a/compiler/luci/partition/src/Nodes/CircleIf.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleIf.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleIf.test.cpp b/compiler/luci/partition/src/Nodes/CircleIf.test.cpp
index dbd25c822..cbb766221 100644
--- a/compiler/luci/partition/src/Nodes/CircleIf.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleIf.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleIfOut.cpp b/compiler/luci/partition/src/Nodes/CircleIfOut.cpp
index 969bdd93c..2eb5dda1f 100644
--- a/compiler/luci/partition/src/Nodes/CircleIfOut.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleIfOut.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleIfOut.test.cpp b/compiler/luci/partition/src/Nodes/CircleIfOut.test.cpp
index 9207654bc..ec2dde3b2 100644
--- a/compiler/luci/partition/src/Nodes/CircleIfOut.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleIfOut.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleInstanceNorm.cpp b/compiler/luci/partition/src/Nodes/CircleInstanceNorm.cpp
index 386652fb1..f64ffd8b4 100644
--- a/compiler/luci/partition/src/Nodes/CircleInstanceNorm.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleInstanceNorm.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleInstanceNorm.test.cpp b/compiler/luci/partition/src/Nodes/CircleInstanceNorm.test.cpp
index b932223d0..4363c6c18 100644
--- a/compiler/luci/partition/src/Nodes/CircleInstanceNorm.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleInstanceNorm.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleL2Normalize.cpp b/compiler/luci/partition/src/Nodes/CircleL2Normalize.cpp
index 61ddba264..df26930ec 100644
--- a/compiler/luci/partition/src/Nodes/CircleL2Normalize.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleL2Normalize.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleL2Normalize.test.cpp b/compiler/luci/partition/src/Nodes/CircleL2Normalize.test.cpp
index 4fc23727a..b114a15f0 100644
--- a/compiler/luci/partition/src/Nodes/CircleL2Normalize.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleL2Normalize.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleL2Pool2D.cpp b/compiler/luci/partition/src/Nodes/CircleL2Pool2D.cpp
index 24333d507..1eacddb62 100644
--- a/compiler/luci/partition/src/Nodes/CircleL2Pool2D.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleL2Pool2D.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleL2Pool2D.test.cpp b/compiler/luci/partition/src/Nodes/CircleL2Pool2D.test.cpp
index 40328488c..22f99d5ef 100644
--- a/compiler/luci/partition/src/Nodes/CircleL2Pool2D.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleL2Pool2D.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleLeakyRelu.cpp b/compiler/luci/partition/src/Nodes/CircleLeakyRelu.cpp
index 3da1ba287..1702ddeb1 100644
--- a/compiler/luci/partition/src/Nodes/CircleLeakyRelu.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLeakyRelu.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleLeakyRelu.test.cpp b/compiler/luci/partition/src/Nodes/CircleLeakyRelu.test.cpp
index 5a0d1dd87..71dc55ea0 100644
--- a/compiler/luci/partition/src/Nodes/CircleLeakyRelu.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLeakyRelu.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleLess.cpp b/compiler/luci/partition/src/Nodes/CircleLess.cpp
index aab495fcc..52726f9be 100644
--- a/compiler/luci/partition/src/Nodes/CircleLess.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLess.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleLess.test.cpp b/compiler/luci/partition/src/Nodes/CircleLess.test.cpp
index ab65e5d18..c5d194efe 100644
--- a/compiler/luci/partition/src/Nodes/CircleLess.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLess.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleLessEqual.cpp b/compiler/luci/partition/src/Nodes/CircleLessEqual.cpp
index ec129dbe8..e9a3c412b 100644
--- a/compiler/luci/partition/src/Nodes/CircleLessEqual.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLessEqual.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleLessEqual.test.cpp b/compiler/luci/partition/src/Nodes/CircleLessEqual.test.cpp
index 0dd8986b6..29f4ababa 100644
--- a/compiler/luci/partition/src/Nodes/CircleLessEqual.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLessEqual.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleLocalResponseNormalization.cpp b/compiler/luci/partition/src/Nodes/CircleLocalResponseNormalization.cpp
index 6b0d1cd12..7a00bf94f 100644
--- a/compiler/luci/partition/src/Nodes/CircleLocalResponseNormalization.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLocalResponseNormalization.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleLocalResponseNormalization.test.cpp b/compiler/luci/partition/src/Nodes/CircleLocalResponseNormalization.test.cpp
index e1973387d..5e5723817 100644
--- a/compiler/luci/partition/src/Nodes/CircleLocalResponseNormalization.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLocalResponseNormalization.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleLog.cpp b/compiler/luci/partition/src/Nodes/CircleLog.cpp
index c43570fa2..676d22fc0 100644
--- a/compiler/luci/partition/src/Nodes/CircleLog.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLog.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleLog.test.cpp b/compiler/luci/partition/src/Nodes/CircleLog.test.cpp
index 8a43f6f01..0a2b97538 100644
--- a/compiler/luci/partition/src/Nodes/CircleLog.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLog.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleLogSoftmax.cpp b/compiler/luci/partition/src/Nodes/CircleLogSoftmax.cpp
index de582c80d..c67b08f0f 100644
--- a/compiler/luci/partition/src/Nodes/CircleLogSoftmax.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLogSoftmax.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleLogSoftmax.test.cpp b/compiler/luci/partition/src/Nodes/CircleLogSoftmax.test.cpp
index 1e60bf54c..b6daeb781 100644
--- a/compiler/luci/partition/src/Nodes/CircleLogSoftmax.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLogSoftmax.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleLogicalAnd.cpp b/compiler/luci/partition/src/Nodes/CircleLogicalAnd.cpp
index 28e8f42e5..1498d85ec 100644
--- a/compiler/luci/partition/src/Nodes/CircleLogicalAnd.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLogicalAnd.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleLogicalAnd.test.cpp b/compiler/luci/partition/src/Nodes/CircleLogicalAnd.test.cpp
index a1189f06f..0b9513626 100644
--- a/compiler/luci/partition/src/Nodes/CircleLogicalAnd.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLogicalAnd.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleLogicalNot.cpp b/compiler/luci/partition/src/Nodes/CircleLogicalNot.cpp
index e2657824c..f9c077e4e 100644
--- a/compiler/luci/partition/src/Nodes/CircleLogicalNot.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLogicalNot.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleLogicalNot.test.cpp b/compiler/luci/partition/src/Nodes/CircleLogicalNot.test.cpp
index f6b34596e..88dff3651 100644
--- a/compiler/luci/partition/src/Nodes/CircleLogicalNot.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLogicalNot.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleLogicalOr.cpp b/compiler/luci/partition/src/Nodes/CircleLogicalOr.cpp
index 418dc023b..59592e41d 100644
--- a/compiler/luci/partition/src/Nodes/CircleLogicalOr.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLogicalOr.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleLogicalOr.test.cpp b/compiler/luci/partition/src/Nodes/CircleLogicalOr.test.cpp
index fee3f4779..35f8029c0 100644
--- a/compiler/luci/partition/src/Nodes/CircleLogicalOr.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLogicalOr.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleLogistic.cpp b/compiler/luci/partition/src/Nodes/CircleLogistic.cpp
index 7d788512d..804597bed 100644
--- a/compiler/luci/partition/src/Nodes/CircleLogistic.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLogistic.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleLogistic.test.cpp b/compiler/luci/partition/src/Nodes/CircleLogistic.test.cpp
index c4b3f7fe3..241d84040 100644
--- a/compiler/luci/partition/src/Nodes/CircleLogistic.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleLogistic.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleMatrixDiag.cpp b/compiler/luci/partition/src/Nodes/CircleMatrixDiag.cpp
index e92806aff..297e9f2cc 100644
--- a/compiler/luci/partition/src/Nodes/CircleMatrixDiag.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMatrixDiag.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleMatrixDiag.test.cpp b/compiler/luci/partition/src/Nodes/CircleMatrixDiag.test.cpp
index 03e3c3c3e..472cab8c8 100644
--- a/compiler/luci/partition/src/Nodes/CircleMatrixDiag.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMatrixDiag.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleMatrixSetDiag.cpp b/compiler/luci/partition/src/Nodes/CircleMatrixSetDiag.cpp
index 29bb7fe5f..b327aacad 100644
--- a/compiler/luci/partition/src/Nodes/CircleMatrixSetDiag.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMatrixSetDiag.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleMatrixSetDiag.test.cpp b/compiler/luci/partition/src/Nodes/CircleMatrixSetDiag.test.cpp
index 5503ea18f..4ff797c43 100644
--- a/compiler/luci/partition/src/Nodes/CircleMatrixSetDiag.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMatrixSetDiag.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleMaxPool2D.cpp b/compiler/luci/partition/src/Nodes/CircleMaxPool2D.cpp
index 75a665aee..dee90e5c0 100644
--- a/compiler/luci/partition/src/Nodes/CircleMaxPool2D.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMaxPool2D.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleMaxPool2D.test.cpp b/compiler/luci/partition/src/Nodes/CircleMaxPool2D.test.cpp
index 16996497a..949e0d724 100644
--- a/compiler/luci/partition/src/Nodes/CircleMaxPool2D.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMaxPool2D.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleMaximum.cpp b/compiler/luci/partition/src/Nodes/CircleMaximum.cpp
index 2ba6055b4..459917e3e 100644
--- a/compiler/luci/partition/src/Nodes/CircleMaximum.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMaximum.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleMaximum.test.cpp b/compiler/luci/partition/src/Nodes/CircleMaximum.test.cpp
index 370174c37..e6a6d5741 100644
--- a/compiler/luci/partition/src/Nodes/CircleMaximum.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMaximum.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleMean.cpp b/compiler/luci/partition/src/Nodes/CircleMean.cpp
index b634e5838..c704d0054 100644
--- a/compiler/luci/partition/src/Nodes/CircleMean.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMean.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleMean.test.cpp b/compiler/luci/partition/src/Nodes/CircleMean.test.cpp
index 53435d9dc..838d7aea2 100644
--- a/compiler/luci/partition/src/Nodes/CircleMean.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMean.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleMinimum.cpp b/compiler/luci/partition/src/Nodes/CircleMinimum.cpp
index cdf757583..8958bf64a 100644
--- a/compiler/luci/partition/src/Nodes/CircleMinimum.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMinimum.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleMinimum.test.cpp b/compiler/luci/partition/src/Nodes/CircleMinimum.test.cpp
index 2fe6b0da6..a6c86a27a 100644
--- a/compiler/luci/partition/src/Nodes/CircleMinimum.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMinimum.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleMirrorPad.cpp b/compiler/luci/partition/src/Nodes/CircleMirrorPad.cpp
index 16a24abf7..91c3cb97a 100644
--- a/compiler/luci/partition/src/Nodes/CircleMirrorPad.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMirrorPad.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleMirrorPad.test.cpp b/compiler/luci/partition/src/Nodes/CircleMirrorPad.test.cpp
index 605a126c9..b837e1012 100644
--- a/compiler/luci/partition/src/Nodes/CircleMirrorPad.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMirrorPad.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleMul.cpp b/compiler/luci/partition/src/Nodes/CircleMul.cpp
index 2cd2b4038..12e14728c 100644
--- a/compiler/luci/partition/src/Nodes/CircleMul.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMul.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleMul.test.cpp b/compiler/luci/partition/src/Nodes/CircleMul.test.cpp
index 99cf0824d..b316679f8 100644
--- a/compiler/luci/partition/src/Nodes/CircleMul.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleMul.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleNeg.cpp b/compiler/luci/partition/src/Nodes/CircleNeg.cpp
index 413ad4930..e9dcc45cd 100644
--- a/compiler/luci/partition/src/Nodes/CircleNeg.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleNeg.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleNeg.test.cpp b/compiler/luci/partition/src/Nodes/CircleNeg.test.cpp
index bd74a3665..ab13c9416 100644
--- a/compiler/luci/partition/src/Nodes/CircleNeg.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleNeg.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4.cpp b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4.cpp
index 63ff3f021..88d72e12f 100644
--- a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4.test.cpp b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4.test.cpp
index 2771aef49..e796a14c3 100644
--- a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4Out.cpp b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4Out.cpp
index 80e4704b9..61caa3a4c 100644
--- a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4Out.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4Out.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4Out.test.cpp b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4Out.test.cpp
index 5a0a8da8c..eb04f2688 100644
--- a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4Out.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV4Out.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5.cpp b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5.cpp
index c1f117724..3b0b755a4 100644
--- a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5.test.cpp b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5.test.cpp
index 1f20fbb0f..c9c31b315 100644
--- a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5Out.cpp b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5Out.cpp
index 69e3cc8e8..3eed260c2 100644
--- a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5Out.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5Out.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5Out.test.cpp b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5Out.test.cpp
index e001b0b0b..2c5822fe3 100644
--- a/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5Out.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleNonMaxSuppressionV5Out.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleNotEqual.cpp b/compiler/luci/partition/src/Nodes/CircleNotEqual.cpp
index c40c2a21a..29a6a43bb 100644
--- a/compiler/luci/partition/src/Nodes/CircleNotEqual.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleNotEqual.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleNotEqual.test.cpp b/compiler/luci/partition/src/Nodes/CircleNotEqual.test.cpp
index 360940ca7..2983e1b27 100644
--- a/compiler/luci/partition/src/Nodes/CircleNotEqual.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleNotEqual.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleOneHot.cpp b/compiler/luci/partition/src/Nodes/CircleOneHot.cpp
index d76f49255..d172fb834 100644
--- a/compiler/luci/partition/src/Nodes/CircleOneHot.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleOneHot.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleOneHot.test.cpp b/compiler/luci/partition/src/Nodes/CircleOneHot.test.cpp
index 3c555c290..59780e424 100644
--- a/compiler/luci/partition/src/Nodes/CircleOneHot.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleOneHot.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleOutputDummy.cpp b/compiler/luci/partition/src/Nodes/CircleOutputDummy.cpp
index a033e80a8..61d7620aa 100644
--- a/compiler/luci/partition/src/Nodes/CircleOutputDummy.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleOutputDummy.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace luci
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleOutputExclude.cpp b/compiler/luci/partition/src/Nodes/CircleOutputExclude.cpp
index 106eb405d..36ce35077 100644
--- a/compiler/luci/partition/src/Nodes/CircleOutputExclude.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleOutputExclude.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace luci
 {
diff --git a/compiler/luci/partition/src/Nodes/CirclePRelu.cpp b/compiler/luci/partition/src/Nodes/CirclePRelu.cpp
index b8a2341c8..6a2325715 100644
--- a/compiler/luci/partition/src/Nodes/CirclePRelu.cpp
+++ b/compiler/luci/partition/src/Nodes/CirclePRelu.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CirclePRelu.test.cpp b/compiler/luci/partition/src/Nodes/CirclePRelu.test.cpp
index e5bcedcf6..f2a2e2c7d 100644
--- a/compiler/luci/partition/src/Nodes/CirclePRelu.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CirclePRelu.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CirclePack.cpp b/compiler/luci/partition/src/Nodes/CirclePack.cpp
index 326881067..d4b49bfa9 100644
--- a/compiler/luci/partition/src/Nodes/CirclePack.cpp
+++ b/compiler/luci/partition/src/Nodes/CirclePack.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CirclePack.test.cpp b/compiler/luci/partition/src/Nodes/CirclePack.test.cpp
index 68c513848..665b137e8 100644
--- a/compiler/luci/partition/src/Nodes/CirclePack.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CirclePack.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CirclePad.cpp b/compiler/luci/partition/src/Nodes/CirclePad.cpp
index eb2a89c85..0a1d6f7f9 100644
--- a/compiler/luci/partition/src/Nodes/CirclePad.cpp
+++ b/compiler/luci/partition/src/Nodes/CirclePad.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CirclePad.test.cpp b/compiler/luci/partition/src/Nodes/CirclePad.test.cpp
index 24ea83fa3..72f97d6a4 100644
--- a/compiler/luci/partition/src/Nodes/CirclePad.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CirclePad.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CirclePadV2.cpp b/compiler/luci/partition/src/Nodes/CirclePadV2.cpp
index 001fecbcb..969cc271d 100644
--- a/compiler/luci/partition/src/Nodes/CirclePadV2.cpp
+++ b/compiler/luci/partition/src/Nodes/CirclePadV2.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CirclePadV2.test.cpp b/compiler/luci/partition/src/Nodes/CirclePadV2.test.cpp
index aea8e0cce..9829f6269 100644
--- a/compiler/luci/partition/src/Nodes/CirclePadV2.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CirclePadV2.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CirclePow.cpp b/compiler/luci/partition/src/Nodes/CirclePow.cpp
index fb180ee69..ce69e7402 100644
--- a/compiler/luci/partition/src/Nodes/CirclePow.cpp
+++ b/compiler/luci/partition/src/Nodes/CirclePow.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CirclePow.test.cpp b/compiler/luci/partition/src/Nodes/CirclePow.test.cpp
index 7a5be4d13..f4e49c023 100644
--- a/compiler/luci/partition/src/Nodes/CirclePow.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CirclePow.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleQuantize.cpp b/compiler/luci/partition/src/Nodes/CircleQuantize.cpp
index 340c1da42..903a94e32 100644
--- a/compiler/luci/partition/src/Nodes/CircleQuantize.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleQuantize.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleQuantize.test.cpp b/compiler/luci/partition/src/Nodes/CircleQuantize.test.cpp
index 1f348b45c..5ca1a6baa 100644
--- a/compiler/luci/partition/src/Nodes/CircleQuantize.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleQuantize.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleRange.cpp b/compiler/luci/partition/src/Nodes/CircleRange.cpp
index f295338d8..fa1a02c71 100644
--- a/compiler/luci/partition/src/Nodes/CircleRange.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleRange.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleRange.test.cpp b/compiler/luci/partition/src/Nodes/CircleRange.test.cpp
index 59a95f119..b5b0c8aa8 100644
--- a/compiler/luci/partition/src/Nodes/CircleRange.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleRange.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleRank.cpp b/compiler/luci/partition/src/Nodes/CircleRank.cpp
index f7cce762b..35b4764aa 100644
--- a/compiler/luci/partition/src/Nodes/CircleRank.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleRank.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleRank.test.cpp b/compiler/luci/partition/src/Nodes/CircleRank.test.cpp
index 74c520bee..5a0a71a7e 100644
--- a/compiler/luci/partition/src/Nodes/CircleRank.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleRank.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleReduceAny.cpp b/compiler/luci/partition/src/Nodes/CircleReduceAny.cpp
index ed762dbc6..262e12ac1 100644
--- a/compiler/luci/partition/src/Nodes/CircleReduceAny.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReduceAny.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleReduceAny.test.cpp b/compiler/luci/partition/src/Nodes/CircleReduceAny.test.cpp
index 792f51187..45c292073 100644
--- a/compiler/luci/partition/src/Nodes/CircleReduceAny.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReduceAny.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleReduceMax.cpp b/compiler/luci/partition/src/Nodes/CircleReduceMax.cpp
index 09586ecee..d91c78e41 100644
--- a/compiler/luci/partition/src/Nodes/CircleReduceMax.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReduceMax.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleReduceMax.test.cpp b/compiler/luci/partition/src/Nodes/CircleReduceMax.test.cpp
index 8fbaf653e..2ad18f339 100644
--- a/compiler/luci/partition/src/Nodes/CircleReduceMax.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReduceMax.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleReduceMin.cpp b/compiler/luci/partition/src/Nodes/CircleReduceMin.cpp
index 105214d0b..65fca6ab3 100644
--- a/compiler/luci/partition/src/Nodes/CircleReduceMin.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReduceMin.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleReduceMin.test.cpp b/compiler/luci/partition/src/Nodes/CircleReduceMin.test.cpp
index c37d6248f..db48f54d7 100644
--- a/compiler/luci/partition/src/Nodes/CircleReduceMin.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReduceMin.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleReduceProd.cpp b/compiler/luci/partition/src/Nodes/CircleReduceProd.cpp
index 2fb4e3e01..daac168b2 100644
--- a/compiler/luci/partition/src/Nodes/CircleReduceProd.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReduceProd.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleReduceProd.test.cpp b/compiler/luci/partition/src/Nodes/CircleReduceProd.test.cpp
index cc1ac83ad..f5f69f0ff 100644
--- a/compiler/luci/partition/src/Nodes/CircleReduceProd.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReduceProd.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleRelu.cpp b/compiler/luci/partition/src/Nodes/CircleRelu.cpp
index d3617bdbd..63ac31ba9 100644
--- a/compiler/luci/partition/src/Nodes/CircleRelu.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleRelu.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleRelu.test.cpp b/compiler/luci/partition/src/Nodes/CircleRelu.test.cpp
index ccaf5760b..ec4d10f09 100644
--- a/compiler/luci/partition/src/Nodes/CircleRelu.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleRelu.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleRelu6.cpp b/compiler/luci/partition/src/Nodes/CircleRelu6.cpp
index fb9ba6f36..c2956c456 100644
--- a/compiler/luci/partition/src/Nodes/CircleRelu6.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleRelu6.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleRelu6.test.cpp b/compiler/luci/partition/src/Nodes/CircleRelu6.test.cpp
index 1341b0e06..e9ecbe2e6 100644
--- a/compiler/luci/partition/src/Nodes/CircleRelu6.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleRelu6.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleReluN1To1.cpp b/compiler/luci/partition/src/Nodes/CircleReluN1To1.cpp
index 476195b71..1141297da 100644
--- a/compiler/luci/partition/src/Nodes/CircleReluN1To1.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReluN1To1.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleReluN1To1.test.cpp b/compiler/luci/partition/src/Nodes/CircleReluN1To1.test.cpp
index 7dc63c6ef..ae60a97e5 100644
--- a/compiler/luci/partition/src/Nodes/CircleReluN1To1.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReluN1To1.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleReshape.cpp b/compiler/luci/partition/src/Nodes/CircleReshape.cpp
index e59670453..49f7c64a7 100644
--- a/compiler/luci/partition/src/Nodes/CircleReshape.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReshape.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleReshape.test.cpp b/compiler/luci/partition/src/Nodes/CircleReshape.test.cpp
index 73cbbdfcc..198cfa1b6 100644
--- a/compiler/luci/partition/src/Nodes/CircleReshape.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReshape.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleResizeBilinear.cpp b/compiler/luci/partition/src/Nodes/CircleResizeBilinear.cpp
index 0f504015b..41fdedf2a 100644
--- a/compiler/luci/partition/src/Nodes/CircleResizeBilinear.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleResizeBilinear.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleResizeBilinear.test.cpp b/compiler/luci/partition/src/Nodes/CircleResizeBilinear.test.cpp
index c2d8b714b..437e448a6 100644
--- a/compiler/luci/partition/src/Nodes/CircleResizeBilinear.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleResizeBilinear.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleResizeNearestNeighbor.cpp b/compiler/luci/partition/src/Nodes/CircleResizeNearestNeighbor.cpp
index c985b7f51..567db4961 100644
--- a/compiler/luci/partition/src/Nodes/CircleResizeNearestNeighbor.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleResizeNearestNeighbor.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleResizeNearestNeighbor.test.cpp b/compiler/luci/partition/src/Nodes/CircleResizeNearestNeighbor.test.cpp
index 9cc2e558e..5dc99a385 100644
--- a/compiler/luci/partition/src/Nodes/CircleResizeNearestNeighbor.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleResizeNearestNeighbor.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleReverseSequence.cpp b/compiler/luci/partition/src/Nodes/CircleReverseSequence.cpp
index 225d29ea5..348cdbb78 100644
--- a/compiler/luci/partition/src/Nodes/CircleReverseSequence.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReverseSequence.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleReverseSequence.test.cpp b/compiler/luci/partition/src/Nodes/CircleReverseSequence.test.cpp
index 408fc0c9c..751910326 100644
--- a/compiler/luci/partition/src/Nodes/CircleReverseSequence.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReverseSequence.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleReverseV2.cpp b/compiler/luci/partition/src/Nodes/CircleReverseV2.cpp
index d59a7de93..4b8c4a444 100644
--- a/compiler/luci/partition/src/Nodes/CircleReverseV2.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReverseV2.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleReverseV2.test.cpp b/compiler/luci/partition/src/Nodes/CircleReverseV2.test.cpp
index d41ad8e66..351c6f2c0 100644
--- a/compiler/luci/partition/src/Nodes/CircleReverseV2.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleReverseV2.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleRound.cpp b/compiler/luci/partition/src/Nodes/CircleRound.cpp
index 9170bcdd9..97d002870 100644
--- a/compiler/luci/partition/src/Nodes/CircleRound.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleRound.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleRound.test.cpp b/compiler/luci/partition/src/Nodes/CircleRound.test.cpp
index fad090476..02f335dc3 100644
--- a/compiler/luci/partition/src/Nodes/CircleRound.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleRound.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleRsqrt.cpp b/compiler/luci/partition/src/Nodes/CircleRsqrt.cpp
index 03e64aad0..44abd5ef7 100644
--- a/compiler/luci/partition/src/Nodes/CircleRsqrt.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleRsqrt.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleRsqrt.test.cpp b/compiler/luci/partition/src/Nodes/CircleRsqrt.test.cpp
index d76b96e14..39ae1f8f3 100644
--- a/compiler/luci/partition/src/Nodes/CircleRsqrt.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleRsqrt.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSVDF.cpp b/compiler/luci/partition/src/Nodes/CircleSVDF.cpp
index f661a794c..e2b99c49d 100644
--- a/compiler/luci/partition/src/Nodes/CircleSVDF.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSVDF.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSVDF.test.cpp b/compiler/luci/partition/src/Nodes/CircleSVDF.test.cpp
index 5fae5206e..af8cd5549 100644
--- a/compiler/luci/partition/src/Nodes/CircleSVDF.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSVDF.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleScatterNd.cpp b/compiler/luci/partition/src/Nodes/CircleScatterNd.cpp
index 62912b791..88a3ecf19 100644
--- a/compiler/luci/partition/src/Nodes/CircleScatterNd.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleScatterNd.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleScatterNd.test.cpp b/compiler/luci/partition/src/Nodes/CircleScatterNd.test.cpp
index f271f8843..4ce787569 100644
--- a/compiler/luci/partition/src/Nodes/CircleScatterNd.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleScatterNd.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSegmentSum.cpp b/compiler/luci/partition/src/Nodes/CircleSegmentSum.cpp
index 5fc320a16..6540416c6 100644
--- a/compiler/luci/partition/src/Nodes/CircleSegmentSum.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSegmentSum.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSegmentSum.test.cpp b/compiler/luci/partition/src/Nodes/CircleSegmentSum.test.cpp
index a6bcff20a..453b7cc01 100644
--- a/compiler/luci/partition/src/Nodes/CircleSegmentSum.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSegmentSum.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSelect.cpp b/compiler/luci/partition/src/Nodes/CircleSelect.cpp
index dbe1dd48f..436e95609 100644
--- a/compiler/luci/partition/src/Nodes/CircleSelect.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSelect.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSelect.test.cpp b/compiler/luci/partition/src/Nodes/CircleSelect.test.cpp
index 912934b8b..2a38de593 100644
--- a/compiler/luci/partition/src/Nodes/CircleSelect.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSelect.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSelectV2.cpp b/compiler/luci/partition/src/Nodes/CircleSelectV2.cpp
index 28072c860..a8b6ab556 100644
--- a/compiler/luci/partition/src/Nodes/CircleSelectV2.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSelectV2.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSelectV2.test.cpp b/compiler/luci/partition/src/Nodes/CircleSelectV2.test.cpp
index e8d128e93..c2ebdbe11 100644
--- a/compiler/luci/partition/src/Nodes/CircleSelectV2.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSelectV2.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleShape.cpp b/compiler/luci/partition/src/Nodes/CircleShape.cpp
index f93cf1458..2fb3dcdd8 100644
--- a/compiler/luci/partition/src/Nodes/CircleShape.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleShape.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleShape.test.cpp b/compiler/luci/partition/src/Nodes/CircleShape.test.cpp
index 9b4afdcc2..38033a3bc 100644
--- a/compiler/luci/partition/src/Nodes/CircleShape.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleShape.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSin.cpp b/compiler/luci/partition/src/Nodes/CircleSin.cpp
index 62c776ef6..0ef605994 100644
--- a/compiler/luci/partition/src/Nodes/CircleSin.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSin.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSin.test.cpp b/compiler/luci/partition/src/Nodes/CircleSin.test.cpp
index fbee6f662..e141b4530 100644
--- a/compiler/luci/partition/src/Nodes/CircleSin.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSin.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSlice.cpp b/compiler/luci/partition/src/Nodes/CircleSlice.cpp
index 7895d9ece..811d81f9e 100644
--- a/compiler/luci/partition/src/Nodes/CircleSlice.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSlice.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSlice.test.cpp b/compiler/luci/partition/src/Nodes/CircleSlice.test.cpp
index 3c666ad6c..0718c7f15 100644
--- a/compiler/luci/partition/src/Nodes/CircleSlice.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSlice.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSoftmax.cpp b/compiler/luci/partition/src/Nodes/CircleSoftmax.cpp
index 0a93787e7..6b08f005e 100644
--- a/compiler/luci/partition/src/Nodes/CircleSoftmax.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSoftmax.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSoftmax.test.cpp b/compiler/luci/partition/src/Nodes/CircleSoftmax.test.cpp
index b25629863..571ad80ff 100644
--- a/compiler/luci/partition/src/Nodes/CircleSoftmax.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSoftmax.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSpaceToBatchND.cpp b/compiler/luci/partition/src/Nodes/CircleSpaceToBatchND.cpp
index b94948bee..dc48b36d6 100644
--- a/compiler/luci/partition/src/Nodes/CircleSpaceToBatchND.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSpaceToBatchND.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSpaceToBatchND.test.cpp b/compiler/luci/partition/src/Nodes/CircleSpaceToBatchND.test.cpp
index 279e9b232..0fcf22fd0 100644
--- a/compiler/luci/partition/src/Nodes/CircleSpaceToBatchND.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSpaceToBatchND.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSpaceToDepth.cpp b/compiler/luci/partition/src/Nodes/CircleSpaceToDepth.cpp
index bd4523ca8..55d562f3d 100644
--- a/compiler/luci/partition/src/Nodes/CircleSpaceToDepth.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSpaceToDepth.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSpaceToDepth.test.cpp b/compiler/luci/partition/src/Nodes/CircleSpaceToDepth.test.cpp
index 207163d08..771c1f372 100644
--- a/compiler/luci/partition/src/Nodes/CircleSpaceToDepth.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSpaceToDepth.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSparseToDense.cpp b/compiler/luci/partition/src/Nodes/CircleSparseToDense.cpp
index d1ed18818..cc2f5e915 100644
--- a/compiler/luci/partition/src/Nodes/CircleSparseToDense.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSparseToDense.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSparseToDense.test.cpp b/compiler/luci/partition/src/Nodes/CircleSparseToDense.test.cpp
index 2257186e8..06b3814ee 100644
--- a/compiler/luci/partition/src/Nodes/CircleSparseToDense.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSparseToDense.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSplit.cpp b/compiler/luci/partition/src/Nodes/CircleSplit.cpp
index d6d62a8ed..5f851f049 100644
--- a/compiler/luci/partition/src/Nodes/CircleSplit.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSplit.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSplit.test.cpp b/compiler/luci/partition/src/Nodes/CircleSplit.test.cpp
index d8d0953e0..a4242b9ab 100644
--- a/compiler/luci/partition/src/Nodes/CircleSplit.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSplit.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSplitOut.cpp b/compiler/luci/partition/src/Nodes/CircleSplitOut.cpp
index 4021f2042..1a447581e 100644
--- a/compiler/luci/partition/src/Nodes/CircleSplitOut.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSplitOut.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSplitOut.test.cpp b/compiler/luci/partition/src/Nodes/CircleSplitOut.test.cpp
index 85fe2685b..b7cf6fc7d 100644
--- a/compiler/luci/partition/src/Nodes/CircleSplitOut.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSplitOut.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSplitV.cpp b/compiler/luci/partition/src/Nodes/CircleSplitV.cpp
index f13205725..43ebe076f 100644
--- a/compiler/luci/partition/src/Nodes/CircleSplitV.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSplitV.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSplitV.test.cpp b/compiler/luci/partition/src/Nodes/CircleSplitV.test.cpp
index 3ac1d6c27..877a44759 100644
--- a/compiler/luci/partition/src/Nodes/CircleSplitV.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSplitV.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSplitVOut.cpp b/compiler/luci/partition/src/Nodes/CircleSplitVOut.cpp
index 2034805cd..4bac6c5dc 100644
--- a/compiler/luci/partition/src/Nodes/CircleSplitVOut.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSplitVOut.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSplitVOut.test.cpp b/compiler/luci/partition/src/Nodes/CircleSplitVOut.test.cpp
index 434dfb0ad..b3cf4d939 100644
--- a/compiler/luci/partition/src/Nodes/CircleSplitVOut.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSplitVOut.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSqrt.cpp b/compiler/luci/partition/src/Nodes/CircleSqrt.cpp
index f737aac8d..fd6d0ec05 100644
--- a/compiler/luci/partition/src/Nodes/CircleSqrt.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSqrt.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSqrt.test.cpp b/compiler/luci/partition/src/Nodes/CircleSqrt.test.cpp
index fa7f7fe2a..be298835e 100644
--- a/compiler/luci/partition/src/Nodes/CircleSqrt.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSqrt.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSquare.cpp b/compiler/luci/partition/src/Nodes/CircleSquare.cpp
index 1476a8694..56dd5440d 100644
--- a/compiler/luci/partition/src/Nodes/CircleSquare.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSquare.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSquare.test.cpp b/compiler/luci/partition/src/Nodes/CircleSquare.test.cpp
index bb6a7c33f..a509b31b5 100644
--- a/compiler/luci/partition/src/Nodes/CircleSquare.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSquare.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSquaredDifference.cpp b/compiler/luci/partition/src/Nodes/CircleSquaredDifference.cpp
index 40dd31706..e47be2c7e 100644
--- a/compiler/luci/partition/src/Nodes/CircleSquaredDifference.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSquaredDifference.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSquaredDifference.test.cpp b/compiler/luci/partition/src/Nodes/CircleSquaredDifference.test.cpp
index 9cfe9eefb..a900f1dc3 100644
--- a/compiler/luci/partition/src/Nodes/CircleSquaredDifference.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSquaredDifference.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSqueeze.cpp b/compiler/luci/partition/src/Nodes/CircleSqueeze.cpp
index bc9fda296..ffe3c911b 100644
--- a/compiler/luci/partition/src/Nodes/CircleSqueeze.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSqueeze.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSqueeze.test.cpp b/compiler/luci/partition/src/Nodes/CircleSqueeze.test.cpp
index 1f0971043..7a6e2bf44 100644
--- a/compiler/luci/partition/src/Nodes/CircleSqueeze.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSqueeze.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleStridedSlice.cpp b/compiler/luci/partition/src/Nodes/CircleStridedSlice.cpp
index 3bdca8a8a..953b45107 100644
--- a/compiler/luci/partition/src/Nodes/CircleStridedSlice.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleStridedSlice.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleStridedSlice.test.cpp b/compiler/luci/partition/src/Nodes/CircleStridedSlice.test.cpp
index 130ff9159..3e950fd25 100644
--- a/compiler/luci/partition/src/Nodes/CircleStridedSlice.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleStridedSlice.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSub.cpp b/compiler/luci/partition/src/Nodes/CircleSub.cpp
index 8ac294b7b..c5bea087f 100644
--- a/compiler/luci/partition/src/Nodes/CircleSub.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSub.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSub.test.cpp b/compiler/luci/partition/src/Nodes/CircleSub.test.cpp
index 7c0d83745..ca51865a7 100644
--- a/compiler/luci/partition/src/Nodes/CircleSub.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSub.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleSum.cpp b/compiler/luci/partition/src/Nodes/CircleSum.cpp
index bef1d4676..e929fd090 100644
--- a/compiler/luci/partition/src/Nodes/CircleSum.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSum.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleSum.test.cpp b/compiler/luci/partition/src/Nodes/CircleSum.test.cpp
index 1ed65c04f..21f6bbb74 100644
--- a/compiler/luci/partition/src/Nodes/CircleSum.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleSum.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleTanh.cpp b/compiler/luci/partition/src/Nodes/CircleTanh.cpp
index e6c56ebf7..ef5c2c993 100644
--- a/compiler/luci/partition/src/Nodes/CircleTanh.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleTanh.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleTanh.test.cpp b/compiler/luci/partition/src/Nodes/CircleTanh.test.cpp
index 17cd48731..1e2d0629c 100644
--- a/compiler/luci/partition/src/Nodes/CircleTanh.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleTanh.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleTile.cpp b/compiler/luci/partition/src/Nodes/CircleTile.cpp
index 0381b4dac..0c217436e 100644
--- a/compiler/luci/partition/src/Nodes/CircleTile.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleTile.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleTile.test.cpp b/compiler/luci/partition/src/Nodes/CircleTile.test.cpp
index 79d1ba16c..9449c1fa7 100644
--- a/compiler/luci/partition/src/Nodes/CircleTile.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleTile.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleTopKV2.cpp b/compiler/luci/partition/src/Nodes/CircleTopKV2.cpp
index ce8a6f5df..41dfa9c22 100644
--- a/compiler/luci/partition/src/Nodes/CircleTopKV2.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleTopKV2.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleTopKV2.test.cpp b/compiler/luci/partition/src/Nodes/CircleTopKV2.test.cpp
index f08f3f315..e0c4a3a84 100644
--- a/compiler/luci/partition/src/Nodes/CircleTopKV2.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleTopKV2.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleTopKV2Out.cpp b/compiler/luci/partition/src/Nodes/CircleTopKV2Out.cpp
index 6ca6e3d29..19f0fa7bf 100644
--- a/compiler/luci/partition/src/Nodes/CircleTopKV2Out.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleTopKV2Out.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleTopKV2Out.test.cpp b/compiler/luci/partition/src/Nodes/CircleTopKV2Out.test.cpp
index a5c1c43f7..ba085f6a9 100644
--- a/compiler/luci/partition/src/Nodes/CircleTopKV2Out.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleTopKV2Out.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleTranspose.cpp b/compiler/luci/partition/src/Nodes/CircleTranspose.cpp
index 1cbb54666..cbbdb0090 100644
--- a/compiler/luci/partition/src/Nodes/CircleTranspose.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleTranspose.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleTranspose.test.cpp b/compiler/luci/partition/src/Nodes/CircleTranspose.test.cpp
index b3b16307c..847683844 100644
--- a/compiler/luci/partition/src/Nodes/CircleTranspose.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleTranspose.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleTransposeConv.cpp b/compiler/luci/partition/src/Nodes/CircleTransposeConv.cpp
index 469cc9a1a..6b6819d59 100644
--- a/compiler/luci/partition/src/Nodes/CircleTransposeConv.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleTransposeConv.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleTransposeConv.test.cpp b/compiler/luci/partition/src/Nodes/CircleTransposeConv.test.cpp
index ee9fb0e78..68adaad81 100644
--- a/compiler/luci/partition/src/Nodes/CircleTransposeConv.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleTransposeConv.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp b/compiler/luci/partition/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp
index 3f0374aac..332301455 100644
--- a/compiler/luci/partition/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleUnidirectionalSequenceLSTM.test.cpp b/compiler/luci/partition/src/Nodes/CircleUnidirectionalSequenceLSTM.test.cpp
index aeefef093..2630461ae 100644
--- a/compiler/luci/partition/src/Nodes/CircleUnidirectionalSequenceLSTM.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleUnidirectionalSequenceLSTM.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleUnique.cpp b/compiler/luci/partition/src/Nodes/CircleUnique.cpp
index 79ca59466..c035b7ed7 100644
--- a/compiler/luci/partition/src/Nodes/CircleUnique.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleUnique.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleUnique.test.cpp b/compiler/luci/partition/src/Nodes/CircleUnique.test.cpp
index 23f299840..910087a8b 100644
--- a/compiler/luci/partition/src/Nodes/CircleUnique.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleUnique.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleUniqueOut.cpp b/compiler/luci/partition/src/Nodes/CircleUniqueOut.cpp
index f244dd6eb..23b1abaa5 100644
--- a/compiler/luci/partition/src/Nodes/CircleUniqueOut.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleUniqueOut.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleUniqueOut.test.cpp b/compiler/luci/partition/src/Nodes/CircleUniqueOut.test.cpp
index 887640790..954957497 100644
--- a/compiler/luci/partition/src/Nodes/CircleUniqueOut.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleUniqueOut.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleUnpack.cpp b/compiler/luci/partition/src/Nodes/CircleUnpack.cpp
index f83c5d810..43ebcb418 100644
--- a/compiler/luci/partition/src/Nodes/CircleUnpack.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleUnpack.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleUnpack.test.cpp b/compiler/luci/partition/src/Nodes/CircleUnpack.test.cpp
index b164cc3bc..444b04373 100644
--- a/compiler/luci/partition/src/Nodes/CircleUnpack.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleUnpack.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleUnpackOut.cpp b/compiler/luci/partition/src/Nodes/CircleUnpackOut.cpp
index b8982fff5..ee1de153f 100644
--- a/compiler/luci/partition/src/Nodes/CircleUnpackOut.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleUnpackOut.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleUnpackOut.test.cpp b/compiler/luci/partition/src/Nodes/CircleUnpackOut.test.cpp
index 9ed440966..2aaef8d04 100644
--- a/compiler/luci/partition/src/Nodes/CircleUnpackOut.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleUnpackOut.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleVariable.cpp b/compiler/luci/partition/src/Nodes/CircleVariable.cpp
index f7f6f21fd..e7a794a16 100644
--- a/compiler/luci/partition/src/Nodes/CircleVariable.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleVariable.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace luci
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleWhere.cpp b/compiler/luci/partition/src/Nodes/CircleWhere.cpp
index 8ef274268..d0fc8465d 100644
--- a/compiler/luci/partition/src/Nodes/CircleWhere.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleWhere.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleWhere.test.cpp b/compiler/luci/partition/src/Nodes/CircleWhere.test.cpp
index 942f804c2..f17131c94 100644
--- a/compiler/luci/partition/src/Nodes/CircleWhere.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleWhere.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleWhile.cpp b/compiler/luci/partition/src/Nodes/CircleWhile.cpp
index 7820aca01..95b77f753 100644
--- a/compiler/luci/partition/src/Nodes/CircleWhile.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleWhile.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleWhile.test.cpp b/compiler/luci/partition/src/Nodes/CircleWhile.test.cpp
index bffb7869d..6ee7aba62 100644
--- a/compiler/luci/partition/src/Nodes/CircleWhile.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleWhile.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleWhileOut.cpp b/compiler/luci/partition/src/Nodes/CircleWhileOut.cpp
index 1cb4419db..5cd68355c 100644
--- a/compiler/luci/partition/src/Nodes/CircleWhileOut.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleWhileOut.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleWhileOut.test.cpp b/compiler/luci/partition/src/Nodes/CircleWhileOut.test.cpp
index 901f31b01..f58eba031 100644
--- a/compiler/luci/partition/src/Nodes/CircleWhileOut.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleWhileOut.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/Nodes/CircleZerosLike.cpp b/compiler/luci/partition/src/Nodes/CircleZerosLike.cpp
index 715042d86..795d88de3 100644
--- a/compiler/luci/partition/src/Nodes/CircleZerosLike.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleZerosLike.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 namespace
 {
diff --git a/compiler/luci/partition/src/Nodes/CircleZerosLike.test.cpp b/compiler/luci/partition/src/Nodes/CircleZerosLike.test.cpp
index 74c873cb2..f887bc36f 100644
--- a/compiler/luci/partition/src/Nodes/CircleZerosLike.test.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleZerosLike.test.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "ConnectNode.test.h"
 
diff --git a/compiler/luci/partition/src/PartitionIR.cpp b/compiler/luci/partition/src/PartitionIR.cpp
index 60dc74f89..969fa7092 100644
--- a/compiler/luci/partition/src/PartitionIR.cpp
+++ b/compiler/luci/partition/src/PartitionIR.cpp
@@ -64,7 +64,7 @@ std::unique_ptr<PGroups> PGroups::make_copy(void) const
     // note: d_pgroup is now nullptr as it's moved
   }
 
-  return std::move(d_pgroups);
+  return d_pgroups;
 }
 
 GroupKey PGroups::group_of(luci::CircleNode *node) const
diff --git a/compiler/luci/partition/src/PartitionMerge.cpp b/compiler/luci/partition/src/PartitionMerge.cpp
index 4c3971bd8..aa8a827cd 100644
--- a/compiler/luci/partition/src/PartitionMerge.cpp
+++ b/compiler/luci/partition/src/PartitionMerge.cpp
@@ -255,7 +255,7 @@ std::unique_ptr<luci::PGroups> merge_pgroups(const luci::PGroups *s_pgroups)
     }
   } while (changed);
 
-  return std::move(d_pgroups);
+  return d_pgroups;
 }
 
 } // namespace luci
diff --git a/compiler/luci/partition/src/PartitionPGroups.cpp b/compiler/luci/partition/src/PartitionPGroups.cpp
index eaeacf9c4..2e95f08f7 100644
--- a/compiler/luci/partition/src/PartitionPGroups.cpp
+++ b/compiler/luci/partition/src/PartitionPGroups.cpp
@@ -257,7 +257,7 @@ std::unique_ptr<luci::PGroups> produce_pgroups(const luci::Module *source,
     }
   }
 
-  return std::move(pgroups);
+  return pgroups;
 }
 
 } // namespace luci
diff --git a/compiler/luci/partition/src/PartitionPModules.cpp b/compiler/luci/partition/src/PartitionPModules.cpp
index beaaf6093..251dbea39 100644
--- a/compiler/luci/partition/src/PartitionPModules.cpp
+++ b/compiler/luci/partition/src/PartitionPModules.cpp
@@ -15,7 +15,7 @@
  */
 
 #include "PartitionPModules.h"
-#include "ConnectNode.h"
+#include "luci/ConnectNode.h"
 
 #include "luci/Service/CircleNodeClone.h"
 #include "luci/Log.h"
@@ -156,7 +156,7 @@ std::unique_ptr<loco::Graph> clone_graph(loco::Graph *graph_org, luci::CloneCont
     add_graph_output(graph_clone, output_clone);
   }
 
-  return std::move(graph);
+  return graph;
 }
 
 void clone_recursive_subgraphs(luci::PartedModule &pm, loco::Graph *graph,
diff --git a/compiler/luci/pass/CMakeLists.txt b/compiler/luci/pass/CMakeLists.txt
index 5237c6d3f..d9d004db9 100644
--- a/compiler/luci/pass/CMakeLists.txt
+++ b/compiler/luci/pass/CMakeLists.txt
@@ -1,9 +1,16 @@
 nnas_find_package(FlatBuffers EXACT 2.0 QUIET)
+nnas_find_package(Fp16Source QUIET)
+
 if(NOT FlatBuffers_FOUND)
   message(STATUS "FlatBuffers NOT FOUND")
   return()
 endif(NOT FlatBuffers_FOUND)
 
+if(NOT Fp16Source_FOUND)
+  message(STATUS "Fp16Source NOT FOUND")
+  return()
+endif(NOT Fp16Source_FOUND)
+
 file(GLOB_RECURSE SOURCES "src/*.cpp")
 file(GLOB_RECURSE TESTS "src/*.test.cpp")
 list(REMOVE_ITEM SOURCES ${TESTS})
@@ -14,6 +21,7 @@ endif(NOT LUCI_LIBRARY_TYPE)
 
 add_library(luci_pass ${LUCI_LIBRARY_TYPE} ${SOURCES})
 target_include_directories(luci_pass PRIVATE src)
+target_include_directories(luci_pass PRIVATE ${Fp16Source_DIR}/include)
 target_include_directories(luci_pass PUBLIC include)
 target_link_libraries(luci_pass PUBLIC loco)
 target_link_libraries(luci_pass PUBLIC logo_core)
diff --git a/compiler/luci/pass/include/luci/CircleOptimizer.h b/compiler/luci/pass/include/luci/CircleOptimizer.h
index c803898f6..b94822c35 100644
--- a/compiler/luci/pass/include/luci/CircleOptimizer.h
+++ b/compiler/luci/pass/include/luci/CircleOptimizer.h
@@ -47,8 +47,10 @@ public:
       ResolveCustomOpBatchMatMul,
       ResolveCustomOpMatMul,
       ResolveCustomOpMaxPoolWithArgmax,
+      ResolveCustomOpSplitV,
       FoldAddV2,
       FoldCast,
+      FoldDensify,
       FoldDepthwiseConv2D,
       FoldDequantize,
       FoldGather,
@@ -61,6 +63,7 @@ public:
       ShuffleWeightTo16x1Float32,
       RemoveRedundantTranspose,
       ReplaceMulAddWithDepthwiseConv,
+      ReplaceNonConstFCWithBatchMatMul,
       ReplaceSubWithAdd,
       SubstitutePackToReshape,
       SubstitutePadV2ToPad,
diff --git a/compiler/luci/pass/include/luci/Pass/FoldDensifyPass.h b/compiler/luci/pass/include/luci/Pass/FoldDensifyPass.h
new file mode 100644
index 000000000..8ec81b1d4
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/FoldDensifyPass.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FOLD_DENSIFY_PASS_H__
+#define __LUCI_FOLD_DENSIFY_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to Fold Densify if input is Sparse Constant
+ *
+ */
+struct FoldDensifyPass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::FoldDensifyPass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FOLD_DENSIFY_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/RemoveRedundantDequantizePass.h b/compiler/luci/pass/include/luci/Pass/RemoveRedundantDequantizePass.h
new file mode 100644
index 000000000..2deb75297
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/RemoveRedundantDequantizePass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_REMOVE_REDUNDANT_DEQUANTIZE_PASS_H__
+#define __LUCI_REMOVE_REDUNDANT_DEQUANTIZE_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to remove redundant dequantize operations
+ */
+struct RemoveRedundantDequantizePass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::RemoveRedundantDequantizePass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_REMOVE_REDUNDANT_DEQUANTIZE_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/RemoveUnnecessaryReshapeNetPass.h b/compiler/luci/pass/include/luci/Pass/RemoveUnnecessaryReshapeNetPass.h
new file mode 100644
index 000000000..19948a31c
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/RemoveUnnecessaryReshapeNetPass.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_REMOVE_UNNECESSARY_RESHAPE_NET_PASS_H__
+#define __LUCI_REMOVE_UNNECESSARY_RESHAPE_NET_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to remove unnecessary Reshape nodes.
+ * @details This class will remove unnecessary pre/post-Reshape nodes.
+ *          See https://github.com/Samsung/ONE/issues/9600 for more details.
+ */
+struct RemoveUnnecessaryReshapeNetPass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::RemoveUnnecessaryReshapeNetPass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_REMOVE_UNNECESSARY_RESHAPE_NET_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/ReplaceNonConstFCWithBatchMatMulPass.h b/compiler/luci/pass/include/luci/Pass/ReplaceNonConstFCWithBatchMatMulPass.h
new file mode 100644
index 000000000..24e16ec49
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/ReplaceNonConstFCWithBatchMatMulPass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_REPLACE_NONCONST_FC_WITH_BATCH_MATMUL_PASS_H__
+#define __LUCI_REPLACE_NONCONST_FC_WITH_BATCH_MATMUL_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to replace "FC with non-const weight" with Batched MatMul
+ */
+struct ReplaceNonConstFCWithBatchMatMulPass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::ReplaceNonConstFCWithBatchMatMulPass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_REPLACE_NONCONST_FC_WITH_BATCH_MATMUL_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/ResolveCustomOpSplitVPass.h b/compiler/luci/pass/include/luci/Pass/ResolveCustomOpSplitVPass.h
new file mode 100644
index 000000000..d4f0147e8
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/ResolveCustomOpSplitVPass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_RESOLVE_CUSTOM_OP_SPLIT_V_PASS_H__
+#define __LUCI_RESOLVE_CUSTOM_OP_SPLIT_V_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to resolve certain custom op of subgraph into splitv op in circle schema.
+ */
+struct ResolveCustomOpSplitVPass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::ResolveCustomOpSplitVPass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_RESOLVE_CUSTOM_OP_SPLIT_V_PASS_H__
diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp
index 6dbb22d7c..74c569d20 100644
--- a/compiler/luci/pass/src/CircleOptimizer.cpp
+++ b/compiler/luci/pass/src/CircleOptimizer.cpp
@@ -20,6 +20,7 @@
 #include "luci/Pass/ExpandBroadcastConstPass.h"
 #include "luci/Pass/FoldAddV2Pass.h"
 #include "luci/Pass/FoldCastPass.h"
+#include "luci/Pass/FoldDensifyPass.h"
 #include "luci/Pass/FoldDepthwiseConv2DPass.h"
 #include "luci/Pass/FoldDequantizePass.h"
 #include "luci/Pass/FoldGatherPass.h"
@@ -43,15 +44,18 @@
 #include "luci/Pass/RemoveRedundantTransposePass.h"
 #include "luci/Pass/RemoveRedundantQuantizePass.h"
 #include "luci/Pass/RemoveUnnecessaryReshapePass.h"
+#include "luci/Pass/RemoveUnnecessaryReshapeNetPass.h"
 #include "luci/Pass/RemoveUnnecessarySlicePass.h"
 #include "luci/Pass/RemoveUnnecessaryStridedSlicePass.h"
 #include "luci/Pass/RemoveUnnecessarySplitPass.h"
+#include "luci/Pass/ReplaceNonConstFCWithBatchMatMulPass.h"
 #include "luci/Pass/ReplaceMulAddWithDepthwiseConvPass.h"
 #include "luci/Pass/ReplaceSubWithAddPass.h"
 #include "luci/Pass/ResolveCustomOpAddPass.h"
 #include "luci/Pass/ResolveCustomOpBatchMatMulPass.h"
 #include "luci/Pass/ResolveCustomOpMatMulPass.h"
 #include "luci/Pass/ResolveCustomOpMaxPoolWithArgmaxPass.h"
+#include "luci/Pass/ResolveCustomOpSplitVPass.h"
 #include "luci/Pass/SparsifyTensorPass.h"
 #include "luci/Pass/ShuffleWeightTo16x1Float32Pass.h"
 #include "luci/Pass/SubstitutePackToReshapePass.h"
@@ -127,7 +131,8 @@ bool OptimizeOptionsImpl::query(Algorithm algo)
   return true;
 }
 
-void convert_nchw_to_nhwc(loco::Graph *g, bool preserve_input, bool preserve_output)
+// TODO Make a struct for args
+void convert_nchw_to_nhwc(loco::Graph *g, bool preserve_input, bool preserve_output, bool fuse_fc)
 {
   logo::Phase phase;
 
@@ -135,6 +140,21 @@ void convert_nchw_to_nhwc(loco::Graph *g, bool preserve_input, bool preserve_out
   phase.emplace_back(std::make_unique<luci::CircleShapeInferencePass>());
   phase.emplace_back(std::make_unique<luci::CircleTypeInferencePass>());
 
+  // Resolve custom Ops
+  phase.emplace_back(std::make_unique<luci::ResolveCustomOpAddPass>());
+  phase.emplace_back(std::make_unique<luci::ResolveCustomOpBatchMatMulPass>());
+  phase.emplace_back(std::make_unique<luci::ResolveCustomOpMatMulPass>());
+  phase.emplace_back(std::make_unique<luci::ResolveCustomOpMaxPoolWithArgmaxPass>());
+  phase.emplace_back(std::make_unique<luci::ResolveCustomOpSplitVPass>());
+
+  // Fuse FullyConnected with Add
+  // Why do we perform FuseAddWithFullyConnectedPass before ConvertNCHWToNHWCPass?
+  // FullyConnected Op's layout is not changed in ConvertNCHWToNHWCPass, while
+  // Add Op's layout is changed from NCHW to NHWC.
+  // This would prevent fusion of Add and FullyConnected after ConvertNCHWToNHWCPass.
+  if (fuse_fc)
+    phase.emplace_back(std::make_unique<luci::FuseAddWithFullyConnectedPass>());
+
   phase.emplace_back(
     std::make_unique<luci::ConvertNCHWToNHWCPass>(preserve_input, preserve_output));
 
@@ -190,7 +210,9 @@ void CircleOptimizer::optimize(loco::Graph *g) const
     bool preserve_output =
       _options->param(Options::AlgorithmParameters::NCHW_to_NHWC_output_shape) != "true";
 
-    convert_nchw_to_nhwc(g, preserve_input, preserve_output);
+    bool fuse_fc = _options->query(Options::Algorithm::FuseAddWithFullyConnected);
+
+    convert_nchw_to_nhwc(g, preserve_input, preserve_output, fuse_fc);
   }
 
   /* TRANSFORM DECLARATION BEGIN */
@@ -220,6 +242,10 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   {
     phase.emplace_back(std::make_unique<luci::ResolveCustomOpMaxPoolWithArgmaxPass>());
   }
+  if (_options->query(Options::Algorithm::ResolveCustomOpSplitV))
+  {
+    phase.emplace_back(std::make_unique<luci::ResolveCustomOpSplitVPass>());
+  }
   if (_options->query(Options::Algorithm::FuseInstanceNorm))
   {
     phase.emplace_back(std::make_unique<FuseInstanceNormPass>());
@@ -260,6 +286,10 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   {
     phase.emplace_back(std::make_unique<luci::FoldCastPass>());
   }
+  if (_options->query(Options::Algorithm::FoldDensify))
+  {
+    phase.emplace_back(std::make_unique<luci::FoldDensifyPass>());
+  }
   if (_options->query(Options::Algorithm::FoldDepthwiseConv2D))
   {
     phase.emplace_back(std::make_unique<luci::FoldDepthwiseConv2DPass>());
@@ -307,6 +337,7 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   if (_options->query(Options::Algorithm::RemoveUnnecessaryReshape))
   {
     phase.emplace_back(std::make_unique<luci::RemoveUnnecessaryReshapePass>());
+    phase.emplace_back(std::make_unique<luci::RemoveUnnecessaryReshapeNetPass>());
   }
   if (_options->query(Options::Algorithm::RemoveUnnecessarySlice))
   {
@@ -332,6 +363,10 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   {
     phase.emplace_back(std::make_unique<luci::RemoveRedundantQuantizePass>());
   }
+  if (_options->query(Options::Algorithm::ReplaceNonConstFCWithBatchMatMul))
+  {
+    phase.emplace_back(std::make_unique<luci::ReplaceNonConstFCWithBatchMatMulPass>());
+  }
   if (_options->query(Options::Algorithm::ReplaceMulAddWithDepthwiseConv))
   {
     phase.emplace_back(std::make_unique<luci::ReplaceMulAddWithDepthwiseConvPass>());
diff --git a/compiler/luci/pass/src/CircleQuantizer.cpp b/compiler/luci/pass/src/CircleQuantizer.cpp
index ce38a90b9..9a6550b9f 100644
--- a/compiler/luci/pass/src/CircleQuantizer.cpp
+++ b/compiler/luci/pass/src/CircleQuantizer.cpp
@@ -22,6 +22,7 @@
 #include "luci/Pass/RequantizePass.h"
 #include "luci/Pass/ConvertToFakeQuantizedModelPass.h"
 #include "luci/Pass/FoldDequantizePass.h"
+#include "luci/Pass/RemoveRedundantDequantizePass.h"
 #include "luci/Pass/QuantizePreCheckerPass.h"
 #include "luci/Pass/QuantizeWithMinMaxPass.h"
 #include "luci/Pass/QuantizeDequantizeWeightsPass.h"
@@ -252,8 +253,8 @@ void CircleQuantizer::quantize(loco::Graph *g) const
     static const std::vector<std::string> qwmm_supported_input_model_dtype{"float32"};
     static const std::vector<std::string> qwmm_supported_output_model_dtype{"uint8", "int16"};
     static const std::vector<std::string> qwmm_supported_granularity{"layer", "channel"};
-    static const std::vector<std::string> qwmm_supported_input_type{"uint8", "int16"};
-    static const std::vector<std::string> qwmm_supported_output_type{"uint8", "int16"};
+    static const std::vector<std::string> qwmm_supported_input_type{"uint8", "int16", "float32"};
+    static const std::vector<std::string> qwmm_supported_output_type{"uint8", "int16", "float32"};
 
     auto input_model_dtype =
       _options->param(Options::AlgorithmParameters::Quantize_input_model_dtype);
@@ -434,6 +435,8 @@ void CircleQuantizer::quantize(loco::Graph *g) const
     phase.emplace_back(std::make_unique<luci::CircleShapeInferencePass>());
     phase.emplace_back(std::make_unique<luci::CircleTypeInferencePass>());
 
+    // Remove redundant Dequantize Ops generated during fake quantization
+    phase.emplace_back(std::make_unique<luci::RemoveRedundantDequantizePass>());
     // Fold Dequantize Ops generated during fake quantization
     phase.emplace_back(std::make_unique<luci::FoldDequantizePass>());
 
diff --git a/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp
index ce4f54035..55a29d105 100644
--- a/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp
+++ b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp
@@ -28,6 +28,69 @@
 namespace
 {
 
+// Return true if the shape of 'from' can be broadcast to the shape of 'to'
+// 'to' must have rank 4, i.e. its shape is [N, C, H, W]
+bool broadcastable(const luci::CircleConst *from, const luci::CircleNode *to)
+{
+  assert(to->rank() == 4); // FIX_CALLER_UNLESS
+
+  const auto from_rank = from->rank();
+  if (from_rank > 4)
+    return false;
+
+  // Scalar is always broadcastable
+  if (from_rank == 0)
+    return true;
+
+  for (uint32_t i = 1; i <= from_rank; i++)
+  {
+    auto to_index = 4 - i;
+    auto from_index = from_rank - i;
+
+    if (from->dim(from_index).value() != to->dim(to_index).value() and
+        from->dim(from_index).value() != 1)
+      return false;
+  }
+
+  return true;
+}
+
+// Expand node to rank 4 by prepending dimensions of size 1
+// node should have rank less than or equal to 4
+void expand_to_rank_4(luci::CircleConst *node)
+{
+  auto original_rank = node->rank();
+
+  assert(original_rank <= 4); // FIX_CALLER_UNLESS
+
+  if (original_rank == 4)
+    return;
+
+  std::vector<uint32_t> original_shape;
+  for (uint32_t i = 0; i < original_rank; i++)
+  {
+    original_shape.emplace_back(node->dim(i).value());
+  }
+
+  node->rank(4);
+  for (uint32_t i = 0; i < (4 - original_rank); i++)
+    node->dim(i) = 1;
+
+  for (uint32_t i = 0; i < original_rank; i++)
+    node->dim(i + (4 - original_rank)) = original_shape.at(i);
+}
+
+bool is_output(const loco::Node *node)
+{
+  auto cnode = loco::must_cast<const luci::CircleNode *>(node);
+  auto opcode = cnode->opcode();
+  if (opcode == luci::CircleOpcode::CIRCLEOUTPUT ||
+      opcode == luci::CircleOpcode::CIRCLEOUTPUTEXCLUDE)
+    return true;
+
+  return false;
+}
+
 bool is_same_shape(const luci::CircleNode *node, const std::vector<loco::Dimension> &shape)
 {
   if (not node)
@@ -484,7 +547,7 @@ bool is_NCHW_with_s_const(const T *node, luci::CircleNode *&pred_node,
 //
 // Find MUL with an NCHW pattern described below
 //   - Input (non-constant) shape : [N, C, H, W]
-//   - Input (constant) shape : [1, C, 1, 1], [N, C, H, W] or a scalar (1)
+//   - Input (constant) shape : broadcastable to [N, C, H, W]
 //   - Output shape : [N, C, H, W]
 bool is_NCHW_with_const(const luci::CircleMul *node, luci::CircleNode *&pred_node,
                         luci::CircleConst *&multiplier)
@@ -511,32 +574,12 @@ bool is_NCHW_with_const(const luci::CircleMul *node, luci::CircleNode *&pred_nod
   if (pred_node->rank() != 4)
     return false;
 
-  const auto const_rank = multiplier->rank();
-  // Support Rank 4 or scalar (rank 0 or 1)
-  if (const_rank != 4 && const_rank != 0 && const_rank != 1)
+  if (not broadcastable(multiplier, node))
     return false;
 
-  const auto input_cdim = pred_node->dim(1);
-  const auto output_cdim = node->dim(1);
-
-  if (const_rank == 4)
-  {
-    bool supported_shape = false;
-
-    // Check multiplier is (1, C, 1, 1)
-    if (is_same_shape(multiplier, {1, node->dim(1), 1, 1}))
-      supported_shape = true;
-
-    // Check multiplier is (N, C, H, W)
-    if (is_same_shape(multiplier, {node->dim(0), node->dim(1), node->dim(2), node->dim(3)}))
-      supported_shape = true;
+  expand_to_rank_4(multiplier);
 
-    return supported_shape;
-  }
-  if (input_cdim == output_cdim)
-    return true;
-  else
-    return false;
+  return true;
 }
 
 // We assume ADD with const input is NCHW if,
@@ -569,32 +612,12 @@ bool is_NCHW_with_const(const luci::CircleAdd *node, luci::CircleNode *&pred_nod
   if (pred_node->rank() != 4)
     return false;
 
-  const auto const_rank = beta->rank();
-  // Support Rank 4 or scalar (rank 0 or 1)
-  if (const_rank != 4 && const_rank != 0 && const_rank != 1)
+  if (not broadcastable(beta, node))
     return false;
 
-  const auto input_cdim = pred_node->dim(1);
-  const auto output_cdim = node->dim(1);
-
-  if (const_rank == 4)
-  {
-    bool supported_shape = false;
-
-    // Check beta is (1, C, 1, 1)
-    if (is_same_shape(beta, {1, node->dim(1), 1, 1}))
-      supported_shape = true;
-
-    // Check beta is (N, C, H, W)
-    if (is_same_shape(beta, {node->dim(0), node->dim(1), node->dim(2), node->dim(3)}))
-      supported_shape = true;
+  expand_to_rank_4(beta);
 
-    return supported_shape;
-  }
-  if (input_cdim == output_cdim)
-    return true;
-  else
-    return false;
+  return true;
 }
 
 // We assume SUB with const input is NCHW if,
@@ -675,6 +698,24 @@ template <class T> bool convert_unary_x(T *node)
   return true;
 }
 
+template <class T> bool convert_unary_logits(T *node)
+{
+  const auto pred_node = loco::must_cast<luci::CircleNode *>(node->logits());
+  auto pre_trans = create_pre_transpose(node);
+  pre_trans->a(pred_node);
+  node->logits(pre_trans);
+
+  // Do shape inference for this node again.
+  node->shape_status(luci::ShapeStatus::UNDEFINED);
+
+  auto post_trans = create_post_transpose(node);
+  loco::replace(node).with(post_trans);
+
+  post_trans->a(node);
+
+  return true;
+}
+
 class ConvertNCHWToNHWC final : public luci::CircleNodeMutableVisitor<bool>
 {
   // Default
@@ -742,17 +783,14 @@ class ConvertNCHWToNHWC final : public luci::CircleNodeMutableVisitor<bool>
 
     if (is_NCHW_with_const(node, pred_node, beta))
     {
+      assert(beta->rank() == 4); // FIX is_NCHW_with_const unless
+      auto nhwc_const = create_NHWC_from_NCHW(beta);
+      if (nhwc_const == nullptr)
+        return false;
+      node->y(nhwc_const);
+
       auto pre_trans = create_pre_transpose(node);
       pre_trans->a(pred_node);
-
-      if (beta->rank() == 4)
-      {
-        auto nhwc_const = create_NHWC_from_NCHW(beta);
-        if (nhwc_const == nullptr)
-          return false;
-        node->y(nhwc_const);
-      }
-
       node->x(pre_trans);
     }
     else if (beta == nullptr)
@@ -816,6 +854,11 @@ class ConvertNCHWToNHWC final : public luci::CircleNodeMutableVisitor<bool>
 
   bool visit(luci::CircleLogistic *node) { return convert_unary_x<luci::CircleLogistic>(node); }
 
+  bool visit(luci::CircleLogSoftmax *node)
+  {
+    return convert_unary_logits<luci::CircleLogSoftmax>(node);
+  }
+
   bool visit(luci::CircleMaximum *node)
   {
     luci::CircleNode *pred_node = nullptr;
@@ -954,15 +997,15 @@ class ConvertNCHWToNHWC final : public luci::CircleNodeMutableVisitor<bool>
 
     if (is_NCHW_with_const(node, pred_node, multiplier))
     {
+      assert(multiplier->rank() == 4); // FIX is_NCHW_with_const unless
+      auto nhwc_const = create_NHWC_from_NCHW(multiplier);
+      if (nhwc_const == nullptr)
+        return false;
+      node->y(nhwc_const);
+
       auto pre_trans = create_pre_transpose(node);
       pre_trans->a(pred_node);
       node->x(pre_trans);
-
-      if (multiplier->rank() == 4)
-      {
-        auto nhwc_const = create_NHWC_from_NCHW(multiplier);
-        node->y(nhwc_const);
-      }
     }
     else if (multiplier == nullptr)
     {
@@ -1049,12 +1092,127 @@ class ConvertNCHWToNHWC final : public luci::CircleNodeMutableVisitor<bool>
     return true;
   }
 
+  // TODO Reduce duplicate code with CircleMean
+  bool visit(luci::CircleReduceMax *node)
+  {
+    auto input = loco::must_cast<luci::CircleNode *>(node->input());
+    if (input->rank() != 4)
+      return false;
+
+    auto rindices = dynamic_cast<luci::CircleConst *>(node->reduction_indices());
+    if (not rindices)
+      return false;
+
+    auto nhwc_rindices = create_NHWC_rindices(rindices);
+    if (not nhwc_rindices)
+      return false;
+
+    auto pre_trans = create_pre_transpose(node);
+    pre_trans->a(input);
+    node->input(pre_trans);
+
+    // Do shape inference for this node again.
+    node->shape_status(luci::ShapeStatus::UNDEFINED);
+
+    node->reduction_indices(nhwc_rindices);
+
+    if (node->keep_dims())
+    {
+      auto post_trans = create_post_transpose(node);
+      loco::replace(node).with(post_trans);
+
+      post_trans->a(node);
+
+      return true;
+    }
+
+    // The code below handles the case where node->keep_dims() == false
+    // 1D output never needs a transpose
+    if (node->rank() <= 1)
+      return true;
+
+    std::vector<bool> reduced_dims_nhwc(4, false);
+    uint32_t num_reduced_indices = nhwc_rindices->size<loco::DataType::S32>();
+
+    for (uint32_t ri = 0; ri < num_reduced_indices; ++ri)
+    {
+      reduced_dims_nhwc[nhwc_rindices->at<loco::DataType::S32>(ri)] = true;
+    }
+
+    // if channel dimension has been reduced, we don't need a transpose
+    if (reduced_dims_nhwc[3])
+      return true;
+
+    // likewise, if both spatial dimensions are reduced, no transpose is needed
+    if (reduced_dims_nhwc[1] && reduced_dims_nhwc[2])
+      return true;
+
+    std::vector<int32_t> post_trans_ind;
+    // case 1: only N is reduced
+    if (num_reduced_indices == 1 && reduced_dims_nhwc[0])
+      post_trans_ind = {2, 0, 1};
+
+    // case 2: only H or W is reduced
+    if (num_reduced_indices == 1 && (reduced_dims_nhwc[1] || reduced_dims_nhwc[2]))
+      post_trans_ind = {0, 2, 1};
+
+    // case 3: N and either H or W are reduced
+    if (num_reduced_indices == 2)
+      post_trans_ind = {1, 0};
+
+    auto post_trans = create_Nd_transpose(node, post_trans_ind);
+    loco::replace(node).with(post_trans);
+
+    post_trans->a(node);
+
+    return true;
+  }
+
   bool visit(luci::CircleRelu *node) { return convert_unary_features<luci::CircleRelu>(node); }
 
   bool visit(luci::CircleRelu6 *node) { return convert_unary_features<luci::CircleRelu6>(node); }
 
   bool visit(luci::CircleRsqrt *node) { return convert_unary_x<luci::CircleRsqrt>(node); }
 
+  bool visit(luci::CircleSoftmax *node) { return convert_unary_logits<luci::CircleSoftmax>(node); }
+
+  bool visit(luci::CircleSplitV *node)
+  {
+    // Change split dimension
+    auto axis = dynamic_cast<luci::CircleConst *>(node->split_dim());
+    if (not axis)
+      return false;
+
+    if (axis->dtype() != loco::DataType::S32)
+      return false;
+
+    if (axis->size<loco::DataType::S32>() != 1)
+      return false;
+
+    axis->at<loco::DataType::S32>(0) = nchw_axis_to_nhwc(axis->at<loco::DataType::S32>(0));
+
+    // Insert pre-transpose
+    const auto pred_node = loco::must_cast<luci::CircleNode *>(node->input());
+    auto pre_trans = create_pre_transpose(node);
+    pre_trans->a(pred_node);
+    node->input(pre_trans);
+
+    // Do shape inference for this node again.
+    node->shape_status(luci::ShapeStatus::UNDEFINED);
+
+    // Insert post-transposes
+    for (auto succ : loco::succs(node))
+    {
+      auto svo = loco::must_cast<luci::CircleSplitVOut *>(succ);
+
+      auto post_trans = create_post_transpose(svo);
+      loco::replace(svo).with(post_trans);
+      post_trans->a(svo);
+    }
+
+    return true;
+  }
+
   bool visit(luci::CircleSquaredDifference *node)
   {
     // TODO support CircleConst input
@@ -1195,6 +1353,8 @@ bool ConvertNCHWToNHWCPass::run(loco::Graph *g)
   // pre-Transpose --- [intermediate Ops] --- post-Transpose
   //                |
   //                +--[intermediate Ops] --- post-Transpose
+  //
+  // NOTE Intermediate Ops SHOULD NOT contain pre-Transpose/Reshape
   for (auto node : loco::postorder_traversal(loco::output_nodes(g)))
   {
     if (has_data_format(node))
@@ -1202,25 +1362,51 @@ bool ConvertNCHWToNHWCPass::run(loco::Graph *g)
 
     if (is_pre_transpose(node) || is_pre_reshape(node))
     {
+      std::set<loco::Node *> intermediate;
+
+      // Variable to check intermediate Ops contain pre-Transpose/Reshape
+      bool has_pre = false;
+
+      // Variable to check the pattern is closed with post-Transpose/Reshape
+      bool is_closed = true;
+
       // For recursive call of lambda
-      std::function<void(loco::Node *)> set_data_format_to_succs;
-      set_data_format_to_succs = [&](loco::Node *n) {
+      std::function<void(loco::Node *)> collect_intermediate;
+      collect_intermediate = [&](loco::Node *n) {
         for (auto succ : loco::succs(n))
         {
           // Exit condition
           if (is_post_transpose(succ) || is_post_reshape(succ))
             continue;
 
-          if (not has_data_format(succ))
+          if (is_pre_transpose(succ) || is_pre_reshape(succ))
+          {
+            has_pre = true;
+            break;
+          }
+
+          if (is_output(succ))
           {
-            set_data_format(succ, DataFormat::NHWC);
+            is_closed = false;
+            break;
           }
 
-          set_data_format_to_succs(succ);
+          intermediate.emplace(succ);
+
+          collect_intermediate(succ);
         }
       };
 
-      set_data_format_to_succs(node);
+      collect_intermediate(node);
+
+      if (has_pre or not is_closed)
+        continue;
+
+      for (auto inter : intermediate)
+      {
+        if (not has_data_format(inter))
+          set_data_format(inter, DataFormat::NHWC);
+      }
     }
   }
 
@@ -1248,6 +1434,7 @@ bool ConvertNCHWToNHWCPass::run(loco::Graph *g)
       case luci::CircleOpcode::ELU:
       case luci::CircleOpcode::LEAKY_RELU:
       case luci::CircleOpcode::LOGISTIC:
+      case luci::CircleOpcode::LOG_SOFTMAX:
       case luci::CircleOpcode::MAXIMUM:
       case luci::CircleOpcode::MEAN:
       case luci::CircleOpcode::MINIMUM:
@@ -1255,9 +1442,12 @@ bool ConvertNCHWToNHWCPass::run(loco::Graph *g)
       case luci::CircleOpcode::NEG:
       case luci::CircleOpcode::PAD:
       case luci::CircleOpcode::PADV2:
+      case luci::CircleOpcode::REDUCE_MAX:
       case luci::CircleOpcode::RELU:
       case luci::CircleOpcode::RELU6:
       case luci::CircleOpcode::RSQRT:
+      case luci::CircleOpcode::SOFTMAX:
+      case luci::CircleOpcode::SPLIT_V:
       case luci::CircleOpcode::SQUARED_DIFFERENCE:
       case luci::CircleOpcode::SUB:
         if (!has_data_format(node))
@@ -1296,7 +1486,8 @@ bool ConvertNCHWToNHWCPass::run(loco::Graph *g)
       if (circle_node->rank() != 4)
       {
         // TODO replace the check above with the input rank check, and remove the condition below
-        if (not dynamic_cast<luci::CircleMean *>(node))
+        if (not dynamic_cast<luci::CircleMean *>(node) and
+            not dynamic_cast<luci::CircleReduceMax *>(node))
           continue;
       }
 
diff --git a/compiler/luci/pass/src/ConvertNCHWToNHWCPass.test.cpp b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.test.cpp
index dd81d1380..6bb3d3268 100644
--- a/compiler/luci/pass/src/ConvertNCHWToNHWCPass.test.cpp
+++ b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.test.cpp
@@ -16,6 +16,8 @@
 
 #include <logo/Phase.h>
 
+#include <luci/test/TestIOGraph.h>
+
 #include "luci/Pass/ConvertNCHWToNHWCPass.h"
 #include "luci/Pass/CircleShapeInferencePass.h"
 
@@ -23,6 +25,8 @@
 
 #include <gtest/gtest.h>
 
+using namespace luci::test;
+
 namespace
 {
 
@@ -202,6 +206,173 @@ public:
   luci::CircleConst *post_shape = nullptr;
 };
 
+/**
+ *  Graph with pre-Reshape but no post-Transpose/Reshape.
+ *
+ *  BEFORE
+ *             [Input]
+ *                |
+ *          [Pre-Reshape]
+ *                |
+ *              [Relu]
+ *                |
+ *             [Output]
+ *
+ *  AFTER
+ *             [Input]
+ *                |
+ *          [Pre-Reshape]
+ *                |
+ *          [Pre-Transpose]
+ *                |
+ *              [Relu]
+ *                |
+ *          [Post-Transpose]
+ *                |
+ *             [Output]
+ */
+class NoPostReshapeGraph final : public SimpleGraph // body: Reshape -> Relu
+{
+protected:
+  loco::Node *insertGraphBody(loco::Node *input) override // returns the last body node (relu)
+  {
+    relu = g.nodes()->create<luci::CircleRelu>();
+    pre_reshape = g.nodes()->create<luci::CircleReshape>();
+    pre_shape = g.nodes()->create<luci::CircleConst>();
+
+    pre_shape->dtype(loco::DataType::S32);
+
+    uint32_t channel_size = 16;
+    auto in = loco::must_cast<luci::CircleNode *>(input);
+    in->shape({1, channel_size, 4, 4}); // input dims: 1 x 16 x 4 x 4
+    pre_shape->shape({4});              // 1-D const holding four shape values
+
+    pre_shape->size<loco::DataType::S32>(4);
+    pre_shape->at<loco::DataType::S32>(0) = 1;
+    pre_shape->at<loco::DataType::S32>(1) = 4;
+    pre_shape->at<loco::DataType::S32>(2) = 4;
+    pre_shape->at<loco::DataType::S32>(3) = channel_size; // reshape target: 1 x 4 x 4 x 16
+
+    pre_reshape->tensor(input);
+    pre_reshape->shape(pre_shape);
+    relu->features(pre_reshape);
+
+    relu->name("Relu");
+    pre_reshape->name("pre-reshape");
+
+    return relu;
+  }
+
+public: // created nodes, exposed so tests can inspect them
+  luci::CircleRelu *relu = nullptr;
+  luci::CircleReshape *pre_reshape = nullptr;
+  luci::CircleConst *pre_shape = nullptr;
+};
+
+/**
+ *  Graph with two pre-Reshapes
+ *
+ *  BEFORE
+ *             [Input]
+ *                |
+ *          [Pre-Reshape]
+ *                |
+ *              [Relu]
+ *                |
+ *          [Pre-Reshape]
+ *                |
+ *          [Post-Reshape]
+ *                |
+ *             [Output]
+ *
+ *  AFTER
+ *             [Input]
+ *                |
+ *          [Pre-Reshape]
+ *                |
+ *          [Pre-Transpose]
+ *                |
+ *              [Relu]
+ *                |
+ *          [Post-Transpose]
+ *                |
+ *          [Pre-Reshape]
+ *                |
+ *          [Post-Reshape]
+ *                |
+ *             [Output]
+ */
+class ReluNotClosedGraph final : public SimpleGraph // body: Reshape -> Relu -> Reshape -> Reshape
+{
+protected:
+  loco::Node *insertGraphBody(loco::Node *input) override // returns the last node (post_reshape)
+  {
+    relu = g.nodes()->create<luci::CircleRelu>();
+    pre_reshape = g.nodes()->create<luci::CircleReshape>();
+    pre_reshape_2 = g.nodes()->create<luci::CircleReshape>();
+    post_reshape = g.nodes()->create<luci::CircleReshape>();
+    pre_shape = g.nodes()->create<luci::CircleConst>();
+    pre_shape_2 = g.nodes()->create<luci::CircleConst>();
+    post_shape = g.nodes()->create<luci::CircleConst>();
+
+    pre_shape->dtype(loco::DataType::S32);
+    pre_shape_2->dtype(loco::DataType::S32);
+    post_shape->dtype(loco::DataType::S32);
+
+    uint32_t channel_size = 16;
+    auto in = loco::must_cast<luci::CircleNode *>(input);
+    in->shape({1, channel_size, 4, 4}); // input dims: 1 x 16 x 4 x 4
+    pre_shape->shape({4});
+    pre_shape_2->shape({4});
+    post_shape->shape({4});
+
+    pre_shape->size<loco::DataType::S32>(4);
+    pre_shape->at<loco::DataType::S32>(0) = 1;
+    pre_shape->at<loco::DataType::S32>(1) = 4;
+    pre_shape->at<loco::DataType::S32>(2) = 4;
+    pre_shape->at<loco::DataType::S32>(3) = channel_size; // first reshape target: 1 x 4 x 4 x 16
+
+    pre_shape_2->size<loco::DataType::S32>(4);
+    pre_shape_2->at<loco::DataType::S32>(0) = 1;
+    pre_shape_2->at<loco::DataType::S32>(1) = 4;
+    pre_shape_2->at<loco::DataType::S32>(2) = channel_size;
+    pre_shape_2->at<loco::DataType::S32>(3) = 4; // second reshape target: 1 x 4 x 16 x 4
+
+    post_shape->size<loco::DataType::S32>(4);
+    post_shape->at<loco::DataType::S32>(0) = 1;
+    post_shape->at<loco::DataType::S32>(1) = 4;
+    post_shape->at<loco::DataType::S32>(2) = 4;
+    post_shape->at<loco::DataType::S32>(3) = channel_size; // final reshape target: 1 x 4 x 4 x 16
+
+    pre_reshape->tensor(input);
+    pre_reshape->shape(pre_shape);
+
+    relu->features(pre_reshape);
+
+    pre_reshape_2->tensor(relu);
+    pre_reshape_2->shape(pre_shape_2);
+
+    post_reshape->tensor(pre_reshape_2);
+    post_reshape->shape(post_shape);
+
+    relu->name("Relu");
+    pre_reshape->name("pre-reshape");
+    pre_reshape_2->name("pre-reshape-2"); // each reshape gets its own name
+    post_reshape->name("post-reshape");
+
+    return post_reshape;
+  }
+
+public: // created nodes, exposed so tests can inspect them
+  luci::CircleRelu *relu = nullptr;
+  luci::CircleReshape *pre_reshape = nullptr;
+  luci::CircleReshape *pre_reshape_2 = nullptr;
+  luci::CircleReshape *post_reshape = nullptr;
+  luci::CircleConst *pre_shape = nullptr;
+  luci::CircleConst *pre_shape_2 = nullptr;
+  luci::CircleConst *post_shape = nullptr;
+};
+
 class AddScalarGraph final : public SimpleGraph
 {
 protected:
@@ -312,6 +483,22 @@ public:
   luci::CircleLogistic *logistic = nullptr;
 };
 
+class LogSoftmaxGraph final : public SimpleGraph // body: a single LogSoftmax node
+{
+protected:
+  loco::Node *insertGraphBody(loco::Node *input) override // returns the created LogSoftmax
+  {
+    log_softmax = g.nodes()->create<luci::CircleLogSoftmax>();
+    log_softmax->logits(input); // fed directly by the graph input
+    log_softmax->name("log_softmax");
+
+    return log_softmax;
+  }
+
+public: // exposed so tests can inspect the node
+  luci::CircleLogSoftmax *log_softmax = nullptr;
+};
+
 class MaximumGraph final : public SimpleGraph
 {
 protected:
@@ -642,6 +829,51 @@ public:
   luci::CircleConst *const_value = nullptr;
 };
 
+class ReduceMaxGraph final : public SimpleGraph // body: ReduceMax with const reduction indices
+{
+protected:
+  loco::Node *insertGraphBody(loco::Node *input) override // returns the created ReduceMax
+  {
+    rm = g.nodes()->create<luci::CircleReduceMax>();
+    rindices = g.nodes()->create<luci::CircleConst>();
+
+    rm->dtype(loco::DataType::FLOAT32);
+    rindices->dtype(loco::DataType::S32);
+
+    rm->rank(_shape.size()); // NOTE(review): assumes shape({..}) is just rank + dims - confirm
+    for (uint32_t i = 0; i < _shape.size(); ++i)
+      rm->dim(i) = _shape[i];
+    rindices->shape({static_cast<uint32_t>(_axes.size())});
+
+    rindices->size<loco::DataType::S32>(_axes.size());
+    for (uint32_t i = 0; i < _axes.size(); ++i)
+      rindices->at<loco::DataType::S32>(i) = _axes[i];
+
+    rm->input(input);
+    rm->reduction_indices(rindices);
+    rm->keep_dims(_keep_dims);
+
+    rm->name("reduce_max");
+    rindices->name("rindices");
+
+    return rm;
+  }
+
+public: // setters; their values are read by insertGraphBody()
+  void keep_dims(bool val) { _keep_dims = val; }
+  void axes(std::vector<int32_t> val) { _axes = val; }
+  void shape(std::initializer_list<uint32_t> val) { _shape = val; } // copies the elements
+
+public:
+  luci::CircleReduceMax *rm = nullptr;
+  luci::CircleConst *rindices = nullptr;
+
+private:
+  bool _keep_dims = true;
+  std::vector<int32_t> _axes = {2, 3};
+  std::vector<uint32_t> _shape = {1, 16, 1, 1}; // owned: an initializer_list member would dangle
+};
+
 class ReluGraph final : public SimpleGraph
 {
 protected:
@@ -690,6 +922,111 @@ public:
   luci::CircleRsqrt *rsqrt = nullptr;
 };
 
+class SoftmaxGraph final : public SimpleGraph // body: a single Softmax node
+{
+protected:
+  loco::Node *insertGraphBody(loco::Node *input) override // returns the created Softmax
+  {
+    softmax = g.nodes()->create<luci::CircleSoftmax>();
+    softmax->logits(input); // fed directly by the graph input
+    softmax->name("softmax");
+
+    return softmax;
+  }
+
+public: // exposed so tests can inspect the node
+  luci::CircleSoftmax *softmax = nullptr;
+};
+
+class SplitVGraphlet // SplitV node with three SplitVOut nodes; input is wired by the user
+{
+public:
+  SplitVGraphlet() = default;
+
+public:
+  void init(loco::Graph *g) // creates all nodes in g; _splitv->input() is left unset here
+  {
+    // CircleSplitV
+    _splitv = g->nodes()->create<luci::CircleSplitV>();
+    _splitv->shape({1, 2, 2, 192});
+    _splitv->dtype(loco::DataType::FLOAT32);
+    _splitv->name("splitv");
+
+    // CircleConst
+    auto size_splits = g->nodes()->create<luci::CircleConst>();
+    size_splits->dtype(loco::DataType::S32);
+    size_splits->shape({3});
+    size_splits->size<loco::DataType::S32>(3);
+    size_splits->at<loco::DataType::S32>(0) = 32;
+    size_splits->at<loco::DataType::S32>(1) = 32;
+    size_splits->at<loco::DataType::S32>(2) = 128; // 32 + 32 + 128 = 192, the last dim above
+
+    // CircleConst
+    auto split_dim = g->nodes()->create<luci::CircleConst>();
+    split_dim->dtype(loco::DataType::S32);
+    split_dim->rank(0); // scalar
+    split_dim->size<loco::DataType::S32>(1);
+    split_dim->scalar<loco::DataType::S32>() = 3; // split along dimension 3
+
+    _splitv->size_splits(size_splits);
+    _splitv->split_dim(split_dim);
+    _splitv->num_split(3);
+
+    // CircleSplitVOut
+    _splitv_out1 = g->nodes()->create<luci::CircleSplitVOut>();
+    _splitv_out1->shape({1, 2, 2, 32});
+    _splitv_out1->dtype(loco::DataType::FLOAT32);
+    _splitv_out1->index(0);
+    _splitv_out1->input(_splitv);
+    _splitv_out1->name("splitv_out1");
+
+    // CircleSplitVOut
+    _splitv_out2 = g->nodes()->create<luci::CircleSplitVOut>();
+    _splitv_out2->shape({1, 2, 2, 32});
+    _splitv_out2->dtype(loco::DataType::FLOAT32);
+    _splitv_out2->index(1);
+    _splitv_out2->input(_splitv);
+    _splitv_out2->name("splitv_out2");
+
+    // CircleSplitVOut
+    _splitv_out3 = g->nodes()->create<luci::CircleSplitVOut>();
+    _splitv_out3->shape({1, 2, 2, 128});
+    _splitv_out3->dtype(loco::DataType::FLOAT32);
+    _splitv_out3->index(2);
+    _splitv_out3->input(_splitv);
+    _splitv_out3->name("splitv_out3");
+  }
+
+public:
+  luci::CircleSplitV *splitv() { return _splitv; } // nullptr until init() has run
+
+protected:
+  luci::CircleSplitV *_splitv = nullptr;
+  luci::CircleSplitVOut *_splitv_out1 = nullptr;
+  luci::CircleSplitVOut *_splitv_out2 = nullptr;
+  luci::CircleSplitVOut *_splitv_out3 = nullptr;
+};
+
+class SplitVGraph : public TestIGraphlet, public TestOsGraphlet<3>, public SplitVGraphlet
+{
+public:
+  SplitVGraph() = default;
+
+  void init(void) // builds one input, three outputs and the SplitV nodes, then connects them
+  {
+    TestIGraphlet::init(g(), {1, 2, 2, 192});
+    TestOsGraphlet<3>::init(g(), {{1, 2, 2, 32}, {1, 2, 2, 32}, {1, 2, 2, 128}});
+    SplitVGraphlet::init(g());
+
+    // connect graph
+    _splitv->input(input());
+
+    output(0)->from(_splitv_out1); // each SplitVOut feeds its own graph output
+    output(1)->from(_splitv_out2);
+    output(2)->from(_splitv_out3);
+  }
+};
+
 class SquaredDifferenceGraph final : public SimpleGraph
 {
 protected:
@@ -929,8 +1266,11 @@ TEST(ConvertNCHWToNHWC, AddScalar)
 
   auto new_beta = dynamic_cast<luci::CircleConst *>(g.add->y());
   EXPECT_NE(nullptr, new_beta);
-  EXPECT_EQ(1, new_beta->rank());
+  EXPECT_EQ(4, new_beta->rank());
   EXPECT_EQ(1, new_beta->dim(0).value());
+  EXPECT_EQ(1, new_beta->dim(1).value());
+  EXPECT_EQ(1, new_beta->dim(2).value());
+  EXPECT_EQ(1, new_beta->dim(3).value());
 
   check_pre_trans(g.output->from());
 }
@@ -1017,6 +1357,26 @@ TEST(ConvertNCHWToNHWC, Logistic)
   EXPECT_EQ(16, g.logistic->dim(3).value());
 }
 
+TEST(ConvertNCHWToNHWC, LogSoftmax) // pass must wrap LogSoftmax in pre/post transposes
+{
+  LogSoftmaxGraph g;
+  g.init();
+
+  run_phase(&g.g, true, true);
+
+  check_pre_trans(g.log_softmax->logits());
+
+  auto log_softmax_succs = loco::succs(g.log_softmax);
+  ASSERT_EQ(1, log_softmax_succs.size()); // fatal: begin() is dereferenced on the next line
+  check_post_trans(*log_softmax_succs.begin());
+
+  // Check log_softmax shape
+  EXPECT_EQ(1, g.log_softmax->dim(0).value());
+  EXPECT_EQ(4, g.log_softmax->dim(1).value());
+  EXPECT_EQ(4, g.log_softmax->dim(2).value());
+  EXPECT_EQ(16, g.log_softmax->dim(3).value());
+}
+
 TEST(ConvertNCHWToNHWC, Maximum)
 {
   MaximumGraph g;
@@ -1265,8 +1625,11 @@ TEST(ConvertNCHWToNHWC, MulScalar)
 
   auto new_multiplier = dynamic_cast<luci::CircleConst *>(g.mul->y());
   EXPECT_NE(nullptr, new_multiplier);
-  EXPECT_EQ(1, new_multiplier->rank());
+  EXPECT_EQ(4, new_multiplier->rank());
   EXPECT_EQ(1, new_multiplier->dim(0).value());
+  EXPECT_EQ(1, new_multiplier->dim(1).value());
+  EXPECT_EQ(1, new_multiplier->dim(2).value());
+  EXPECT_EQ(1, new_multiplier->dim(3).value());
 
   check_pre_trans(g.output->from());
 }
@@ -1451,6 +1814,85 @@ TEST(ConvertNCHWToNHWC, Preserve_Input_Output)
   }
 }
 
+TEST(ConvertNCHWToNHWC, ReduceMax) // default graph: axes {2, 3}, keep_dims = true
+{
+  ReduceMaxGraph g;
+  g.init();
+
+  run_phase(&g.g, false, false);
+
+  check_pre_trans(g.rm->input());
+
+  auto rm_succs = loco::succs(g.rm);
+  ASSERT_EQ(1, rm_succs.size()); // fatal: begin() is dereferenced on the next line
+  check_post_trans(*rm_succs.begin());
+
+  auto new_rindices = dynamic_cast<luci::CircleConst *>(g.rm->reduction_indices());
+  ASSERT_NE(nullptr, new_rindices); // fatal: the pointer is dereferenced below
+  EXPECT_EQ(1, new_rindices->rank());
+  EXPECT_EQ(2, new_rindices->dim(0).value());
+  EXPECT_EQ(2, new_rindices->size<loco::DataType::S32>());
+  EXPECT_EQ(1, new_rindices->at<loco::DataType::S32>(0)); // axes {2, 3} are expected as {1, 2}
+  EXPECT_EQ(2, new_rindices->at<loco::DataType::S32>(1));
+}
+
+TEST(ConvertNCHWToNHWC, ReduceMax_keep_dims_false) // table-driven over reduction axes
+{
+  struct TC
+  {
+    std::vector<int32_t> nchw_ind; // reduction axes given to the graph
+    std::vector<int32_t> nhwc_ind; // reduction axes expected after the pass
+    std::initializer_list<uint32_t> shape; // NOTE(review): views a temporary array - confirm
+    bool needs_transpose = false; // true: successor must be a Transpose, false: the Output
+  };
+
+  uint32_t n = 1;
+  uint32_t c = 16;
+  uint32_t h = 4;
+  uint32_t w = 4;
+
+  std::vector<TC> test_cases{{{0}, {0}, {c, h, w}, true},       {{1}, {3}, {n, h, w}, false},
+                             {{2}, {1}, {n, c, w}, true},       {{3}, {2}, {n, c, h}, true},
+                             {{0, 1}, {0, 3}, {h, w}, false},   {{0, 2}, {0, 1}, {c, w}, true},
+                             {{0, 3}, {0, 2}, {c, h}, true},    {{1, 2}, {3, 1}, {n, w}, false},
+                             {{1, 3}, {3, 2}, {n, h}, false},   {{2, 3}, {1, 2}, {n, c}, false},
+                             {{0, 1, 2}, {0, 3, 1}, {w}, false}};
+
+  for (auto &tc : test_cases)
+  {
+    ReduceMaxGraph g;
+    g.keep_dims(false);
+    g.axes(tc.nchw_ind);
+    g.shape(tc.shape);
+    g.init();
+
+    run_phase(&g.g, true, true);
+
+    check_pre_trans(g.rm->input());
+
+    auto rm_succs = loco::succs(g.rm);
+    ASSERT_EQ(1, rm_succs.size()); // fatal: begin() is dereferenced below
+    if (tc.needs_transpose)
+    {
+      EXPECT_NE(nullptr, dynamic_cast<luci::CircleTranspose *>(*rm_succs.begin()));
+    }
+    else
+    {
+      EXPECT_NE(nullptr, dynamic_cast<luci::CircleOutput *>(*rm_succs.begin()));
+    }
+
+    auto new_rindices = dynamic_cast<luci::CircleConst *>(g.rm->reduction_indices());
+    ASSERT_NE(nullptr, new_rindices); // fatal: the pointer is dereferenced below
+    EXPECT_EQ(1, new_rindices->rank());
+    EXPECT_EQ(tc.nhwc_ind.size(), new_rindices->dim(0).value());
+    EXPECT_EQ(tc.nhwc_ind.size(), new_rindices->size<loco::DataType::S32>());
+    for (uint32_t i = 0; i < tc.nhwc_ind.size(); ++i)
+    {
+      EXPECT_EQ(tc.nhwc_ind[i], new_rindices->at<loco::DataType::S32>(i));
+    }
+  }
+}
+
 TEST(ConvertNCHWToNHWC, Relu)
 {
   ReluGraph g;
@@ -1511,6 +1953,57 @@ TEST(ConvertNCHWToNHWC, Rsqrt)
   EXPECT_EQ(16, g.rsqrt->dim(3).value());
 }
 
+TEST(ConvertNCHWToNHWC, Softmax) // pass must wrap Softmax in pre/post transposes
+{
+  SoftmaxGraph g;
+  g.init();
+
+  run_phase(&g.g, true, true);
+
+  check_pre_trans(g.softmax->logits());
+
+  auto softmax_succs = loco::succs(g.softmax);
+  ASSERT_EQ(1, softmax_succs.size()); // fatal: begin() is dereferenced on the next line
+  check_post_trans(*softmax_succs.begin());
+
+  // Check softmax shape
+  EXPECT_EQ(1, g.softmax->dim(0).value());
+  EXPECT_EQ(4, g.softmax->dim(1).value());
+  EXPECT_EQ(4, g.softmax->dim(2).value());
+  EXPECT_EQ(16, g.softmax->dim(3).value());
+}
+
+TEST(ConvertNCHWToNHWC, SplitV) // checks transposes, converted shape and split axis
+{
+  SplitVGraph g;
+  g.init();
+
+  run_phase(g.g(), true, true);
+
+  check_pre_trans(g.splitv()->input());
+
+  auto splitv_succs = loco::succs(g.splitv());
+  for (auto svo : splitv_succs) // reuse the set computed above instead of querying again
+  {
+    for (auto succ : loco::succs(svo))
+    {
+      check_post_trans(succ);
+    }
+  }
+
+  // Check splitv() shape
+  EXPECT_EQ(1, g.splitv()->dim(0).value());
+  EXPECT_EQ(2, g.splitv()->dim(1).value());
+  EXPECT_EQ(192, g.splitv()->dim(2).value());
+  EXPECT_EQ(2, g.splitv()->dim(3).value());
+
+  // Check axis
+  auto axis = dynamic_cast<luci::CircleConst *>(g.splitv()->split_dim());
+  ASSERT_NE(nullptr, axis); // fatal: axis is dereferenced below
+  EXPECT_EQ(1, axis->size<loco::DataType::S32>());
+  EXPECT_EQ(2, axis->at<loco::DataType::S32>(0));
+}
+
 TEST(ConvertNCHWToNHWC, SquaredDifference)
 {
   SquaredDifferenceGraph g;
@@ -1602,3 +2095,31 @@ TEST(ConvertNCHWToNHWC, SubScalar)
 
   check_pre_trans(g.output->from());
 }
+
+TEST(ConvertNCHWToNHWC, Not_Closed_Case1_NEG) // pre-Reshape only: Relu still gets transposes
+{
+  NoPostReshapeGraph g;
+  g.init();
+
+  run_phase(&g.g, true, true);
+
+  check_pre_trans(g.relu->features());
+
+  auto relu_succs = loco::succs(g.relu);
+  ASSERT_EQ(1, relu_succs.size()); // fatal: begin() is dereferenced on the next line
+  check_post_trans(*relu_succs.begin());
+}
+
+TEST(ConvertNCHWToNHWC, Not_Closed_Case2_NEG) // two Reshapes after Relu: still gets transposes
+{
+  ReluNotClosedGraph g;
+  g.init();
+
+  run_phase(&g.g, true, true);
+
+  check_pre_trans(g.relu->features());
+
+  auto relu_succs = loco::succs(g.relu);
+  ASSERT_EQ(1, relu_succs.size()); // fatal: begin() is dereferenced on the next line
+  check_post_trans(*relu_succs.begin());
+}
diff --git a/compiler/luci/pass/src/ConvertToFakeQuantizedModelPass.cpp b/compiler/luci/pass/src/ConvertToFakeQuantizedModelPass.cpp
index 11970fff5..72f590135 100644
--- a/compiler/luci/pass/src/ConvertToFakeQuantizedModelPass.cpp
+++ b/compiler/luci/pass/src/ConvertToFakeQuantizedModelPass.cpp
@@ -184,8 +184,63 @@ struct FakeQuantize final : public luci::CircleNodeMutableVisitor<void>
 
   // For non-const activation, insert Quantize-Dequantize Ops
   // and dequantize the node
-  void visit(luci::CircleConv2D *node) { fq_activation(node); }
   void visit(luci::CircleAdd *node) { fq_activation(node); }
+  void visit(luci::CircleAveragePool2D *node) { fq_activation(node); }
+  void visit(luci::CircleBatchMatMul *node) { fq_activation(node); }
+  void visit(luci::CircleConv2D *node) { fq_activation(node); }
+  void visit(luci::CircleDepthwiseConv2D *node) { fq_activation(node); }
+  void visit(luci::CircleDiv *node) { fq_activation(node); }
+  void visit(luci::CircleFullyConnected *node) { fq_activation(node); }
+  void visit(luci::CircleInstanceNorm *node) { fq_activation(node); }
+  void visit(luci::CircleLeakyRelu *node) { fq_activation(node); }
+  void visit(luci::CircleLogistic *node) { fq_activation(node); }
+  void visit(luci::CircleLogSoftmax *node) { fq_activation(node); }
+  void visit(luci::CircleMaxPool2D *node) { fq_activation(node); }
+  void visit(luci::CircleMul *node) { fq_activation(node); }
+  void visit(luci::CircleNeg *node) { fq_activation(node); }
+  void visit(luci::CirclePad *node) { fq_activation(node); }
+  void visit(luci::CirclePRelu *node) { fq_activation(node); }
+  void visit(luci::CircleMean *node) { fq_activation(node); }
+  void visit(luci::CircleReduceMax *node) { fq_activation(node); }
+  void visit(luci::CircleRelu *node) { fq_activation(node); }
+  void visit(luci::CircleRelu6 *node) { fq_activation(node); }
+  void visit(luci::CircleResizeBilinear *node) { fq_activation(node); }
+  void visit(luci::CircleResizeNearestNeighbor *node) { fq_activation(node); }
+  void visit(luci::CircleRsqrt *node) { fq_activation(node); }
+  void visit(luci::CircleSoftmax *node) { fq_activation(node); }
+  void visit(luci::CircleSqrt *node) { fq_activation(node); }
+  void visit(luci::CircleTanh *node) { fq_activation(node); }
+  void visit(luci::CircleTransposeConv *node) { fq_activation(node); }
+
+  // For Ops that do not change the value of input, do nothing
+  // (dtype will be automatically updated by type inference)
+  void visit(luci::CircleCast *) {}
+  void visit(luci::CircleConcatenation *) {}
+  void visit(luci::CircleGather *) {}
+  void visit(luci::CircleSlice *) {}
+  void visit(luci::CircleStridedSlice *) {}
+  void visit(luci::CircleReshape *) {}
+  void visit(luci::CircleSplit *) {}
+  void visit(luci::CircleSplitOut *) {}
+  void visit(luci::CircleSplitV *) {}
+  void visit(luci::CircleSplitVOut *) {}
+  void visit(luci::CircleTranspose *) {}
+
+  // For Ops that return index, fake quantization is unnecessary
+  void visit(luci::CircleArgMax *) {}
+
+  // Virtual node
+  void visit(luci::CircleOutputExclude *) {}
+
+  void visit(luci::CircleQuantize *node)
+  {
+    RETURN_UNLESS(is_quant_act(node));
+
+    insert_dequantize(node);
+  }
+
+  // Dequantize Op does nothing in fp32 model
+  void visit(luci::CircleDequantize *) {}
 };
 
 #undef RETURN_UNLESS
diff --git a/compiler/luci/pass/src/FoldDensifyPass.cpp b/compiler/luci/pass/src/FoldDensifyPass.cpp
new file mode 100644
index 000000000..5ddc743e5
--- /dev/null
+++ b/compiler/luci/pass/src/FoldDensifyPass.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FoldDensifyPass.h"
+#include "helpers/SparsityFormatConverter.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+
+#include <cassert>
+#include <vector>
+
+namespace
+{
+
+bool is_foldable_const(luci::CircleConst *node) // true for a sparse FLOAT32/FLOAT16 constant
+{
+  if (node->sparsityparam() == nullptr) // no sparsity parameter: nothing to densify
+    return false;
+
+  if (node->dtype() == loco::DataType::FLOAT32)
+    return true;
+  if (node->dtype() == loco::DataType::FLOAT16)
+    return true;
+
+  return false; // any other dtype is not handled
+}
+
+luci::CircleConst *densified_const_node(luci::CircleConst *const_node) // returns a new dense const
+{
+  assert(const_node->sparsityparam());
+
+  auto name = const_node->name();
+  assert(name.length() > 0);
+  auto g = const_node->graph();
+  auto new_const_node = g->nodes()->create<luci::CircleConst>(); // created in the same graph
+
+  new_const_node->dtype(const_node->dtype());
+  new_const_node->rank(const_node->rank());
+
+  uint32_t dim_size = 1; // product of all dims = number of dense elements
+  std::vector<int> dense_shape; // the same dims, in the form passed to FormatConverter
+  for (uint32_t i = 0; i < new_const_node->rank(); ++i)
+  {
+    assert(const_node->dim(i).known());
+    new_const_node->dim(i) = const_node->dim(i);
+
+    uint32_t value = const_node->dim(i).value();
+    dim_size *= value;
+    dense_shape.emplace_back(static_cast<int32_t>(value));
+  }
+
+  if (const_node->dtype() == loco::DataType::FLOAT32)
+    new_const_node->size<loco::DataType::FLOAT32>(dim_size);
+  else
+  {
+    assert(const_node->dtype() == loco::DataType::FLOAT16);
+    new_const_node->size<loco::DataType::FLOAT16>(dim_size);
+  }
+
+  new_const_node->shape_status(luci::ShapeStatus::VALID);
+  new_const_node->name(name + "_DS"); // original name plus a "_DS" suffix
+
+  if (const_node->dtype() == loco::DataType::FLOAT32)
+  {
+    auto const_items = const_node->size<loco::DataType::FLOAT32>();
+    auto f_data = std::make_unique<float[]>(const_items); // plain buffer handed to the converter
+    for (size_t i = 0; i < const_items; ++i)
+      f_data[i] = const_node->at<loco::DataType::FLOAT32>(i);
+
+    sparsity::TfLiteSparsity sp = to_tflite_sparsity(const_node->sparsityparam());
+    sparsity::FormatConverter<float> converter(dense_shape, sp);
+    converter.SparseToDense(f_data.get());
+    const auto &data_dense = converter.GetData();
+    assert(data_dense.size() == dim_size);
+
+    for (uint32_t i = 0; i < dim_size; ++i)
+      new_const_node->at<loco::DataType::FLOAT32>(i) = data_dense[i];
+
+    luci::freeTfLiteSparsity(sp); // presumably frees what to_tflite_sparsity allocated - confirm
+  }
+  else
+  {
+    assert(const_node->dtype() == loco::DataType::FLOAT16);
+
+    auto const_items = const_node->size<loco::DataType::FLOAT16>();
+    auto f_data = std::make_unique<uint16_t[]>(const_items); // plain buffer for the converter
+    for (size_t i = 0; i < const_items; ++i)
+      f_data[i] = const_node->at<loco::DataType::FLOAT16>(i);
+
+    // Primitive type for FLOAT16 is UINT16
+    sparsity::TfLiteSparsity sp = to_tflite_sparsity(const_node->sparsityparam());
+    sparsity::FormatConverter<uint16_t> converter(dense_shape, sp);
+    converter.SparseToDense(f_data.get());
+    const auto &data_dense = converter.GetData();
+    assert(data_dense.size() == dim_size);
+    for (uint32_t i = 0; i < dim_size; ++i)
+      new_const_node->at<loco::DataType::FLOAT16>(i) = data_dense[i];
+
+    luci::freeTfLiteSparsity(sp); // presumably frees what to_tflite_sparsity allocated - confirm
+  }
+
+  return new_const_node;
+}
+
+/**
+ * @brief Fold Densify if input is Sparse Constant; returns true when the node was replaced
+ */
+bool fold_densify(luci::CircleDensify *densify)
+{
+  auto const_input = dynamic_cast<luci::CircleConst *>(densify->input());
+  if (not const_input) // input is not a constant
+    return false;
+
+  if (not is_foldable_const(const_input))
+    return false;
+
+  auto dense_const = densified_const_node(const_input);
+  assert(dense_const);
+
+  loco::replace(densify).with(dense_const); // users of densify now read the dense constant
+  luci::add_origin(dense_const, luci::composite_origin(
+                                  {luci::get_origin(densify), luci::get_origin(const_input)}));
+
+  return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ * BEFORE
+ *
+ *    [CircleConst](sparse)
+ *         |
+ *   [CircleDensify]
+ *         |
+ *    [CircleNode]
+ *         |
+ *
+ * AFTER
+ *
+ *    [CircleConst](dense)  [CircleConst](sparse)
+ *         |                     |
+ *    [CircleNode]          [CircleDensify]
+ *         |
+ */
+bool FoldDensifyPass::run(loco::Graph *g) // returns true if at least one Densify was folded
+{
+  bool changed = false;
+
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
+  {
+    if (auto densify = dynamic_cast<luci::CircleDensify *>(node)) // only Densify nodes are handled
+    {
+      if (fold_densify(densify))
+        changed = true;
+    }
+  }
+
+  return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/FoldDensifyPass.test.cpp b/compiler/luci/pass/src/FoldDensifyPass.test.cpp
new file mode 100644
index 000000000..2f9736f49
--- /dev/null
+++ b/compiler/luci/pass/src/FoldDensifyPass.test.cpp
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FoldDensifyPass.h"
+#include "PassTestGraphs.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+class FoldDensifyPassGraph : public luci::ConstantFoldingAddTestGraph // pattern: Const -> Densify
+{
+public:
+  FoldDensifyPassGraph(std::initializer_list<uint32_t> shape)
+    : luci::ConstantFoldingAddTestGraph(shape, loco::DataType::FLOAT32)
+  {
+    _densify = _g.nodes()->create<luci::CircleDensify>();
+    _x = _g.nodes()->create<luci::CircleConst>();
+
+    _densify->dtype(loco::DataType::FLOAT32);
+    _x->dtype(loco::DataType::FLOAT32);
+
+    _densify->shape(shape);
+    _x->shape(shape);
+
+    _densify->input(_x);
+
+    _densify->name("densify");
+    _x->name("x");
+  }
+
+  loco::Node *createFoldedPattern() override { return _densify; }
+
+public:
+  void fill_const_dense(void) // fills _x with 1, 2, 3, ... and sets no sparsity parameter
+  {
+    uint32_t num_elems = 1;
+    for (uint32_t r = 0; r < _x->rank(); ++r)
+      num_elems *= _x->dim(r).value();
+
+    _x->size<loco::DataType::FLOAT32>(num_elems);
+    for (uint32_t i = 0; i < num_elems; i++)
+      _x->at<loco::DataType::FLOAT32>(i) = static_cast<float>(i + 1);
+  }
+
+  void fill_const_sparse(void) // stores four values plus the sparsity parameter set below
+  {
+    // fill 4x4 of
+    //  [[1 0 0 0]
+    //   [0 2 0 0]
+    //   [0 0 3 0]
+    //   [0 0 0 4]]
+
+    // non-zero (diagonal) values; written as float literals because reading a
+    // uint32_t array through a float pointer violates strict aliasing
+    const float fdata[] = {1.0f, 2.0f, 3.0f, 4.0f};
+
+    _x->size<loco::DataType::FLOAT32>(4);
+    for (uint32_t i = 0; i < 4; i++)
+      _x->at<loco::DataType::FLOAT32>(i) = fdata[i];
+
+    auto sparsityparam = std::make_unique<luci::SparsityParam>();
+    sparsityparam->traversal_order = std::vector<int32_t>({0, 1});
+    sparsityparam->block_map = std::vector<int32_t>({});
+
+    auto dm0 = luci::DimMetaData(luci::DimensionType::DENSE, 4);
+
+    std::vector<int32_t> as_vec = {0, 1, 2, 3, 4};
+    std::vector<int32_t> ai_vec = {0, 1, 2, 3};
+    auto as = luci::SparseIndexVector(luci::SparseIndexVectorType::I32, as_vec);
+    auto ai = luci::SparseIndexVector(luci::SparseIndexVectorType::I32, ai_vec);
+    auto dm1 = luci::DimMetaData(luci::DimensionType::SPARSE_CSR, 0, as, ai);
+    sparsityparam->dim_metadata.emplace_back(dm0);
+    sparsityparam->dim_metadata.emplace_back(dm1);
+
+    _x->sparsityparam(std::move(sparsityparam));
+  }
+
+protected:
+  luci::CircleDensify *_densify = nullptr;
+  luci::CircleConst *_x = nullptr;
+};
+
+class FoldDensifyPassGraphTest : public FoldDensifyPassGraph, public ::testing::Test
+{
+public:
+  FoldDensifyPassGraphTest() : FoldDensifyPassGraph({4, 4}) {} // 4x4 const and Densify
+
+  virtual void SetUp() { init(); } // init() is not defined here; presumably from the base graph
+};
+
+} // namespace
+
+TEST(FoldDensifyPassGraph, name) // the pass must report a non-null name
+{
+  luci::FoldDensifyPass pass;
+  auto const name = pass.name();
+  ASSERT_NE(nullptr, name);
+}
+
+TEST_F(FoldDensifyPassGraphTest, no_sparsity_param_NEG) // dense input: nothing is folded
+{
+  fill_const_dense(); // sets values only; no sparsity parameter is attached
+
+  luci::FoldDensifyPass pass;
+  while (pass.run(graph())) // repeat until the pass reports no change
+    ;
+
+  auto folded_const = getFoldedPattern();
+  EXPECT_EQ(nullptr, folded_const);
+}
+
+TEST_F(FoldDensifyPassGraphTest, sparsity_param) // sparse input folds to a dense 4x4 constant
+{
+  fill_const_sparse();
+
+  luci::FoldDensifyPass pass;
+  while (pass.run(graph())) // repeat until the pass reports no change
+    ;
+
+  auto folded_const = getFoldedPattern();
+  ASSERT_NE(nullptr, folded_const); // fatal: folded_const is dereferenced below
+
+  EXPECT_EQ(2, folded_const->rank());
+  EXPECT_EQ(4, folded_const->dim(0).value());
+  EXPECT_EQ(4, folded_const->dim(1).value());
+  EXPECT_EQ(16, folded_const->size<loco::DataType::FLOAT32>());
+  for (int y = 0; y < 4; ++y)
+  {
+    for (int x = 0; x < 4; ++x)
+    {
+      float ovalue = folded_const->at<loco::DataType::FLOAT32>(y * 4 + x);
+      float fvalue = 0.0; // expected value away from the diagonal
+      if (x == y)
+      {
+        // diagonal position
+        fvalue = static_cast<float>(y + 1);
+      }
+      EXPECT_EQ(fvalue, ovalue);
+    }
+  }
+}
diff --git a/compiler/luci/pass/src/FoldDequantizePass.cpp b/compiler/luci/pass/src/FoldDequantizePass.cpp
index 3dd4f8cea..b6526deb0 100644
--- a/compiler/luci/pass/src/FoldDequantizePass.cpp
+++ b/compiler/luci/pass/src/FoldDequantizePass.cpp
@@ -19,6 +19,8 @@
 #include <luci/IR/CircleNodes.h>
 #include <luci/Profile/CircleNodeOrigin.h>
 
+#include <fp16.h>
+
 namespace
 {
 
@@ -32,6 +34,9 @@ bool is_hybrid_kernel_supported(loco::Node *node)
 
 bool is_foldable_const(luci::CircleConst *node)
 {
+  if (node->dtype() == loco::DataType::FLOAT16)
+    return true;
+
   if (node->quantparam() == nullptr)
     return false;
 
@@ -39,17 +44,18 @@ bool is_foldable_const(luci::CircleConst *node)
     return true;
   if (node->dtype() == loco::DataType::U8)
     return true;
+  if (node->dtype() == loco::DataType::S16)
+    return true;
+  if (node->dtype() == loco::DataType::S32)
+    return true;
+  if (node->dtype() == loco::DataType::S64)
+    return true;
 
   return false;
 }
 
 luci::CircleConst *dequantized_const_node(luci::CircleConst *const_node)
 {
-  if (const_node->quantparam() == nullptr)
-  {
-    throw std::runtime_error("Given constant node has no quantization parameter");
-  }
-
   auto name = const_node->name();
   assert(name.length() > 0);
   auto g = const_node->graph();
@@ -67,38 +73,70 @@ luci::CircleConst *dequantized_const_node(luci::CircleConst *const_node)
   new_const_node->shape_status(luci::ShapeStatus::VALID);
   new_const_node->name(name + "_DQ");
 
+  if (const_node->dtype() == loco::DataType::FLOAT16)
+  {
+    for (uint32_t i = 0; i < new_const_node->size<loco::DataType::FLOAT32>(); ++i)
+    {
+      auto raw = const_node->at<loco::DataType::FLOAT16>(i);
+      new_const_node->at<loco::DataType::FLOAT32>(i) = fp16_ieee_to_fp32_value(raw);
+    }
+    return new_const_node;
+  }
+
+  if (const_node->quantparam() == nullptr)
+  {
+    throw std::runtime_error("Given constant node has no quantization parameter");
+  }
+
   const int32_t q_dim = const_node->quantparam()->quantized_dimension;
-  const int32_t q_dim_value = const_node->dim(q_dim).value();
+  // For scalar, q_dim_value is 1
+  // For non-scalar, q_dim_value is the size of quantized dimension
+  const int32_t q_dim_value = const_node->rank() == 0 ? 1 : const_node->dim(q_dim).value();
 
   int32_t right_count = q_dim_value;
   for (uint32_t i = q_dim + 1; i < const_node->rank(); ++i)
     right_count *= const_node->dim(i).value();
 
-  if (const_node->dtype() == loco::DataType::S8)
+  for (uint32_t i = 0; i < new_const_node->size<loco::DataType::FLOAT32>(); ++i)
   {
-    for (uint32_t i = 0; i < const_node->size<loco::DataType::S8>(); ++i)
-    {
-      uint32_t qd = (i % right_count) / (right_count / q_dim_value);
-      if (qd >= const_node->quantparam()->zerop.size())
-        qd = 0;
+    uint32_t qd = (i % right_count) / (right_count / q_dim_value);
+    if (qd >= const_node->quantparam()->zerop.size())
+      qd = 0;
 
-      new_const_node->at<loco::DataType::FLOAT32>(i) =
-        (float)(const_node->at<loco::DataType::S8>(i) - const_node->quantparam()->zerop.at(qd)) *
-        const_node->quantparam()->scale.at(qd);
-    }
-  }
-  else
-  {
-    for (uint32_t i = 0; i < const_node->size<loco::DataType::U8>(); ++i)
+    switch (const_node->dtype())
     {
-      uint32_t qd = (i % right_count) / (right_count / q_dim_value);
-      if (qd >= const_node->quantparam()->zerop.size())
-        qd = 0;
-
-      new_const_node->at<loco::DataType::FLOAT32>(i) =
-        (float)((int)const_node->at<loco::DataType::U8>(i) -
-                const_node->quantparam()->zerop.at(qd)) *
-        const_node->quantparam()->scale.at(qd);
+      case loco::DataType::S8:
+        new_const_node->at<loco::DataType::FLOAT32>(i) =
+          static_cast<float>(const_node->at<loco::DataType::S8>(i) -
+                             const_node->quantparam()->zerop.at(qd)) *
+          const_node->quantparam()->scale.at(qd);
+        break;
+      case loco::DataType::S16:
+        new_const_node->at<loco::DataType::FLOAT32>(i) =
+          static_cast<float>(const_node->at<loco::DataType::S16>(i) -
+                             const_node->quantparam()->zerop.at(qd)) *
+          const_node->quantparam()->scale.at(qd);
+        break;
+      case loco::DataType::S32:
+        new_const_node->at<loco::DataType::FLOAT32>(i) =
+          static_cast<float>(const_node->at<loco::DataType::S32>(i) -
+                             const_node->quantparam()->zerop.at(qd)) *
+          const_node->quantparam()->scale.at(qd);
+        break;
+      case loco::DataType::S64:
+        new_const_node->at<loco::DataType::FLOAT32>(i) =
+          static_cast<float>(const_node->at<loco::DataType::S64>(i) -
+                             const_node->quantparam()->zerop.at(qd)) *
+          const_node->quantparam()->scale.at(qd);
+        break;
+      case loco::DataType::U8:
+        new_const_node->at<loco::DataType::FLOAT32>(i) =
+          static_cast<float>(const_node->at<loco::DataType::U8>(i) -
+                             const_node->quantparam()->zerop.at(qd)) *
+          const_node->quantparam()->scale.at(qd);
+        break;
+      default:
+        throw std::runtime_error("Not supported dtype for FoldDequantizePass");
     }
   }
 
@@ -160,7 +198,7 @@ bool FoldDequantizePass::run(loco::Graph *g)
 {
   bool changed = false;
 
-  for (auto node : loco::all_nodes(g))
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
   {
     if (auto circle_dequant = dynamic_cast<luci::CircleDequantize *>(node))
     {
diff --git a/compiler/luci/pass/src/FoldDequantizePass.test.cpp b/compiler/luci/pass/src/FoldDequantizePass.test.cpp
index d82a7bc87..fb5b6adc0 100644
--- a/compiler/luci/pass/src/FoldDequantizePass.test.cpp
+++ b/compiler/luci/pass/src/FoldDequantizePass.test.cpp
@@ -15,12 +15,389 @@
  */
 
 #include "luci/Pass/FoldDequantizePass.h"
+#include "PassTestGraphs.h"
 
 #include <gtest/gtest.h>
 
+namespace
+{
+
+template <loco::DataType DT>
+class FoldDequantizeTest : public luci::ConstantFoldingAddTestGraph, public ::testing::Test
+{
+public:
+  FoldDequantizeTest() : luci::ConstantFoldingAddTestGraph({2, 2, 2}, DT) {}
+
+  virtual void SetUp() { init(); }
+
+  loco::Node *createFoldedPattern() override
+  {
+    _dequantize = _g.nodes()->create<luci::CircleDequantize>();
+    _input = _g.nodes()->create<luci::CircleConst>();
+
+    _dequantize->dtype(loco::DataType::FLOAT32);
+    _input->dtype(DT);
+
+    _input->shape({2, 2, 2});
+
+    _input->size<DT>(8);
+    _input->at<DT>(0) = 0;
+    _input->at<DT>(1) = 1;
+    _input->at<DT>(2) = 2;
+    _input->at<DT>(3) = 3;
+    _input->at<DT>(4) = 4;
+    _input->at<DT>(5) = 5;
+    _input->at<DT>(6) = 6;
+    _input->at<DT>(7) = 7;
+
+    auto qparam = std::make_unique<luci::CircleQuantParam>();
+    qparam->quantized_dimension = 1;
+    qparam->scale.push_back(5.0);
+    qparam->scale.push_back(10.0);
+    qparam->zerop.push_back(1);
+    qparam->zerop.push_back(2);
+    _input->quantparam(std::move(qparam));
+
+    _dequantize->input(_input);
+
+    _dequantize->name("dequantize");
+    _input->name("input");
+
+    return _dequantize;
+  }
+
+  void createScalarPattern()
+  {
+    _input->rank(0);
+    _input->size<DT>(1);
+    _input->at<DT>(0) = 1;
+
+    auto qparam = std::make_unique<luci::CircleQuantParam>();
+    qparam->quantized_dimension = 0;
+    qparam->scale.push_back(1.0);
+    qparam->zerop.push_back(0);
+    _input->quantparam(std::move(qparam));
+  }
+
+  void createNotFoldablePattern() { _input->quantparam(nullptr); }
+
+protected:
+  luci::CircleDequantize *_dequantize = nullptr;
+  luci::CircleConst *_input = nullptr;
+};
+
+class S8FoldDequantizeTest : public FoldDequantizeTest<loco::DataType::S8>
+{
+};
+
+class S16FoldDequantizeTest : public FoldDequantizeTest<loco::DataType::S16>
+{
+};
+
+class S32FoldDequantizeTest : public FoldDequantizeTest<loco::DataType::S32>
+{
+};
+
+class S64FoldDequantizeTest : public FoldDequantizeTest<loco::DataType::S64>
+{
+};
+
+class U8FoldDequantizeTest : public FoldDequantizeTest<loco::DataType::U8>
+{
+};
+
+class F16FoldDequantizeTest : public luci::ConstantFoldingTestGraph, public ::testing::Test
+{
+public:
+  F16FoldDequantizeTest() : ConstantFoldingTestGraph({2, 2}, loco::DataType::FLOAT16) {}
+
+  virtual void SetUp() { init(); }
+
+  loco::Node *createFoldedPattern() override
+  {
+    const auto DT = loco::DataType::FLOAT16;
+    _dequantize = _g.nodes()->create<luci::CircleDequantize>();
+    _f16const = _g.nodes()->create<luci::CircleConst>();
+
+    _dequantize->dtype(loco::DataType::FLOAT32);
+    _f16const->dtype(DT);
+
+    _f16const->shape({2, 2});
+
+    _f16const->size<loco::DataType::FLOAT16>(4);
+    _f16const->at<DT>(0) = 49408; // -2.5f
+    _f16const->at<DT>(1) = 47104; // -0.5f
+    _f16const->at<DT>(2) = 0;     //  0.0f
+    _f16const->at<DT>(3) = 15872; //  1.5f
+    // NOTE how to get uint16_t value of float16 ?
+    // Use compiler/souschef/src/Gaussian.cpp GaussianFloat16DataChef::generate()
+    //   uint16_t value = fp16_ieee_from_fp32_value(-2.5);
+    //   printf("-2.5 = %u\r\n", value);
+
+    _dequantize->input(_f16const);
+
+    _dequantize->name("dequantize");
+    _f16const->name("input");
+
+    _output->from(_dequantize);
+
+    return _dequantize;
+  }
+
+  void createNotFoldablePattern() { _dequantize->input(_input); }
+
+protected:
+  luci::CircleConst *getFoldedPattern() override
+  {
+    return dynamic_cast<luci::CircleConst *>(_output->from());
+  }
+
+  void init() override { createFoldedPattern(); }
+
+protected:
+  luci::CircleDequantize *_dequantize = nullptr;
+  luci::CircleConst *_f16const = nullptr;
+};
+
+} // namespace
+
 TEST(FoldDequantizePassTest, name)
 {
   luci::FoldDequantizePass pass;
   auto const name = pass.name();
   ASSERT_NE(nullptr, name);
 }
+
+TEST_F(U8FoldDequantizeTest, fold_dequant_basic)
+{
+  luci::FoldDequantizePass pass;
+  while (pass.run(graph()))
+    ;
+
+  auto folded_const = getFoldedPattern();
+  ASSERT_NE(nullptr, folded_const); // fatal: checks below dereference folded_const
+
+  // Check type, shape, values of folded const
+  EXPECT_EQ(loco::DataType::FLOAT32, folded_const->dtype());
+  EXPECT_EQ(3, folded_const->rank());
+  EXPECT_EQ(2, folded_const->dim(0).value());
+  EXPECT_EQ(2, folded_const->dim(1).value());
+  EXPECT_EQ(2, folded_const->dim(2).value());
+  EXPECT_EQ(-5.0, folded_const->at<loco::DataType::FLOAT32>(0));
+  EXPECT_EQ(0.0, folded_const->at<loco::DataType::FLOAT32>(1));
+  EXPECT_EQ(0.0, folded_const->at<loco::DataType::FLOAT32>(2));
+  EXPECT_EQ(10.0, folded_const->at<loco::DataType::FLOAT32>(3));
+  EXPECT_EQ(15.0, folded_const->at<loco::DataType::FLOAT32>(4));
+  EXPECT_EQ(20.0, folded_const->at<loco::DataType::FLOAT32>(5));
+  EXPECT_EQ(40.0, folded_const->at<loco::DataType::FLOAT32>(6));
+  EXPECT_EQ(50.0, folded_const->at<loco::DataType::FLOAT32>(7));
+}
+
+TEST_F(U8FoldDequantizeTest, fold_dequant_basic_NEG)
+{
+  createNotFoldablePattern();
+
+  luci::FoldDequantizePass pass;
+  while (pass.run(graph()))
+    ;
+
+  auto folded_const = getFoldedPattern();
+  EXPECT_EQ(nullptr, folded_const);
+}
+
+TEST_F(S8FoldDequantizeTest, fold_dequant_basic)
+{
+  luci::FoldDequantizePass pass;
+  while (pass.run(graph()))
+    ;
+
+  auto folded_const = getFoldedPattern();
+  ASSERT_NE(nullptr, folded_const); // fatal: checks below dereference folded_const
+
+  // Check type, shape, values of folded const
+  EXPECT_EQ(loco::DataType::FLOAT32, folded_const->dtype());
+  EXPECT_EQ(3, folded_const->rank());
+  EXPECT_EQ(2, folded_const->dim(0).value());
+  EXPECT_EQ(2, folded_const->dim(1).value());
+  EXPECT_EQ(2, folded_const->dim(2).value());
+  EXPECT_EQ(-5.0, folded_const->at<loco::DataType::FLOAT32>(0));
+  EXPECT_EQ(0.0, folded_const->at<loco::DataType::FLOAT32>(1));
+  EXPECT_EQ(0.0, folded_const->at<loco::DataType::FLOAT32>(2));
+  EXPECT_EQ(10.0, folded_const->at<loco::DataType::FLOAT32>(3));
+  EXPECT_EQ(15.0, folded_const->at<loco::DataType::FLOAT32>(4));
+  EXPECT_EQ(20.0, folded_const->at<loco::DataType::FLOAT32>(5));
+  EXPECT_EQ(40.0, folded_const->at<loco::DataType::FLOAT32>(6));
+  EXPECT_EQ(50.0, folded_const->at<loco::DataType::FLOAT32>(7));
+}
+
+TEST_F(S8FoldDequantizeTest, fold_dequant_basic_NEG)
+{
+  createNotFoldablePattern();
+
+  luci::FoldDequantizePass pass;
+  while (pass.run(graph()))
+    ;
+
+  auto folded_const = getFoldedPattern();
+  EXPECT_EQ(nullptr, folded_const);
+}
+
+TEST_F(S16FoldDequantizeTest, fold_dequant_basic)
+{
+  luci::FoldDequantizePass pass;
+  while (pass.run(graph()))
+    ;
+
+  auto folded_const = getFoldedPattern();
+  ASSERT_NE(nullptr, folded_const); // fatal: checks below dereference folded_const
+
+  // Check type, shape, values of folded const
+  EXPECT_EQ(loco::DataType::FLOAT32, folded_const->dtype());
+  EXPECT_EQ(3, folded_const->rank());
+  EXPECT_EQ(2, folded_const->dim(0).value());
+  EXPECT_EQ(2, folded_const->dim(1).value());
+  EXPECT_EQ(2, folded_const->dim(2).value());
+  EXPECT_EQ(-5.0, folded_const->at<loco::DataType::FLOAT32>(0));
+  EXPECT_EQ(0.0, folded_const->at<loco::DataType::FLOAT32>(1));
+  EXPECT_EQ(0.0, folded_const->at<loco::DataType::FLOAT32>(2));
+  EXPECT_EQ(10.0, folded_const->at<loco::DataType::FLOAT32>(3));
+  EXPECT_EQ(15.0, folded_const->at<loco::DataType::FLOAT32>(4));
+  EXPECT_EQ(20.0, folded_const->at<loco::DataType::FLOAT32>(5));
+  EXPECT_EQ(40.0, folded_const->at<loco::DataType::FLOAT32>(6));
+  EXPECT_EQ(50.0, folded_const->at<loco::DataType::FLOAT32>(7));
+}
+
+TEST_F(S16FoldDequantizeTest, fold_dequant_basic_NEG)
+{
+  createNotFoldablePattern();
+
+  luci::FoldDequantizePass pass;
+  while (pass.run(graph()))
+    ;
+
+  auto folded_const = getFoldedPattern();
+  EXPECT_EQ(nullptr, folded_const);
+}
+
+TEST_F(S32FoldDequantizeTest, fold_dequant_basic)
+{
+  luci::FoldDequantizePass pass;
+  while (pass.run(graph()))
+    ;
+
+  auto folded_const = getFoldedPattern();
+  ASSERT_NE(nullptr, folded_const); // fatal: checks below dereference folded_const
+
+  // Check type, shape, values of folded const
+  EXPECT_EQ(loco::DataType::FLOAT32, folded_const->dtype());
+  EXPECT_EQ(3, folded_const->rank());
+  EXPECT_EQ(2, folded_const->dim(0).value());
+  EXPECT_EQ(2, folded_const->dim(1).value());
+  EXPECT_EQ(2, folded_const->dim(2).value());
+  EXPECT_EQ(-5.0, folded_const->at<loco::DataType::FLOAT32>(0));
+  EXPECT_EQ(0.0, folded_const->at<loco::DataType::FLOAT32>(1));
+  EXPECT_EQ(0.0, folded_const->at<loco::DataType::FLOAT32>(2));
+  EXPECT_EQ(10.0, folded_const->at<loco::DataType::FLOAT32>(3));
+  EXPECT_EQ(15.0, folded_const->at<loco::DataType::FLOAT32>(4));
+  EXPECT_EQ(20.0, folded_const->at<loco::DataType::FLOAT32>(5));
+  EXPECT_EQ(40.0, folded_const->at<loco::DataType::FLOAT32>(6));
+  EXPECT_EQ(50.0, folded_const->at<loco::DataType::FLOAT32>(7));
+}
+
+TEST_F(S32FoldDequantizeTest, fold_dequant_basic_NEG)
+{
+  createNotFoldablePattern();
+
+  luci::FoldDequantizePass pass;
+  while (pass.run(graph()))
+    ;
+
+  auto folded_const = getFoldedPattern();
+  EXPECT_EQ(nullptr, folded_const);
+}
+
+TEST_F(S64FoldDequantizeTest, fold_dequant_basic)
+{
+  luci::FoldDequantizePass pass;
+  while (pass.run(graph()))
+    ;
+
+  auto folded_const = getFoldedPattern();
+  ASSERT_NE(nullptr, folded_const); // fatal: checks below dereference folded_const
+
+  // Check type, shape, values of folded const
+  EXPECT_EQ(loco::DataType::FLOAT32, folded_const->dtype());
+  EXPECT_EQ(3, folded_const->rank());
+  EXPECT_EQ(2, folded_const->dim(0).value());
+  EXPECT_EQ(2, folded_const->dim(1).value());
+  EXPECT_EQ(2, folded_const->dim(2).value());
+  EXPECT_EQ(-5.0, folded_const->at<loco::DataType::FLOAT32>(0));
+  EXPECT_EQ(0.0, folded_const->at<loco::DataType::FLOAT32>(1));
+  EXPECT_EQ(0.0, folded_const->at<loco::DataType::FLOAT32>(2));
+  EXPECT_EQ(10.0, folded_const->at<loco::DataType::FLOAT32>(3));
+  EXPECT_EQ(15.0, folded_const->at<loco::DataType::FLOAT32>(4));
+  EXPECT_EQ(20.0, folded_const->at<loco::DataType::FLOAT32>(5));
+  EXPECT_EQ(40.0, folded_const->at<loco::DataType::FLOAT32>(6));
+  EXPECT_EQ(50.0, folded_const->at<loco::DataType::FLOAT32>(7));
+}
+
+TEST_F(S64FoldDequantizeTest, fold_dequant_basic_NEG)
+{
+  createNotFoldablePattern();
+
+  luci::FoldDequantizePass pass;
+  while (pass.run(graph()))
+    ;
+
+  auto folded_const = getFoldedPattern();
+  EXPECT_EQ(nullptr, folded_const);
+}
+
+TEST_F(U8FoldDequantizeTest, fold_dequant_scalar)
+{
+  createScalarPattern();
+
+  luci::FoldDequantizePass pass;
+  while (pass.run(graph()))
+    ;
+
+  auto folded_const = getFoldedPattern();
+  ASSERT_NE(nullptr, folded_const); // fatal: checks below dereference folded_const
+
+  // Check type, shape, values of folded const
+  EXPECT_EQ(loco::DataType::FLOAT32, folded_const->dtype());
+  EXPECT_EQ(0, folded_const->rank());
+  EXPECT_EQ(1.0, folded_const->at<loco::DataType::FLOAT32>(0));
+}
+
+TEST_F(F16FoldDequantizeTest, fold_dequant_basic)
+{
+  luci::FoldDequantizePass pass;
+  while (pass.run(graph()))
+    ;
+
+  auto folded_const = getFoldedPattern();
+  ASSERT_NE(nullptr, folded_const); // fatal: checks below dereference folded_const
+
+  // Check type, shape, values of folded const
+  EXPECT_EQ(loco::DataType::FLOAT32, folded_const->dtype());
+  EXPECT_EQ(2, folded_const->rank());
+  EXPECT_EQ(2, folded_const->dim(0).value());
+  EXPECT_EQ(2, folded_const->dim(1).value());
+  EXPECT_EQ(-2.5, folded_const->at<loco::DataType::FLOAT32>(0));
+  EXPECT_EQ(-0.5, folded_const->at<loco::DataType::FLOAT32>(1));
+  EXPECT_EQ(0.0, folded_const->at<loco::DataType::FLOAT32>(2));
+  EXPECT_EQ(1.5, folded_const->at<loco::DataType::FLOAT32>(3));
+}
+
+TEST_F(F16FoldDequantizeTest, fold_dequant_basic_NEG)
+{
+  createNotFoldablePattern();
+
+  luci::FoldDequantizePass pass;
+  while (pass.run(graph()))
+    ;
+
+  auto folded_const = getFoldedPattern();
+  EXPECT_EQ(nullptr, folded_const);
+}
diff --git a/compiler/luci/pass/src/FoldSparseToDensePass.cpp b/compiler/luci/pass/src/FoldSparseToDensePass.cpp
index 0c6fc43ed..ed60d8899 100644
--- a/compiler/luci/pass/src/FoldSparseToDensePass.cpp
+++ b/compiler/luci/pass/src/FoldSparseToDensePass.cpp
@@ -19,6 +19,8 @@
 
 #include <luci/IR/CircleNodes.h>
 
+#include <limits>
+
 namespace
 {
 
diff --git a/compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.cpp b/compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.cpp
index 2c990f0a5..bc09abee2 100644
--- a/compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.cpp
+++ b/compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.cpp
@@ -22,6 +22,7 @@
 #include <luci/Profile/CircleNodeOrigin.h>
 #include <luci/Service/CircleShapeInference.h>
 #include <luci/Service/Nodes/CircleConst.h>
+#include <luci/Service/CircleNodeClone.h>
 
 namespace
 {
@@ -55,6 +56,26 @@ void copy_shape(luci::CircleReshape *reshape, luci::CircleReshape *new_reshape)
     new_reshape->newShape()->dim(r) = reshape->newShape()->dim(r);
 }
 
+luci::CircleReshape *create_cloned_reshape(luci::CircleReshape *reshape)
+{
+  assert(reshape != nullptr); // FIX_CALLER_UNLESS
+
+  luci::CircleConst *cloned_shape = clone_shape(reshape);
+  if (cloned_shape == nullptr)
+    return nullptr;
+
+  auto cloned_node = luci::clone_node(reshape, reshape->graph());
+  if (cloned_node == nullptr)
+    return nullptr;
+
+  auto new_reshape = loco::must_cast<luci::CircleReshape *>(cloned_node);
+  new_reshape->shape(cloned_shape);
+  new_reshape->name(reshape->name() + "_C");
+  luci::add_origin(new_reshape, luci::get_origin(reshape));
+
+  return new_reshape;
+}
+
 bool forward_reshape(luci::CircleReshape *reshape, luci::CircleNeg *neg)
 {
   assert(reshape != nullptr);
@@ -85,6 +106,26 @@ bool forward_reshape(luci::CircleReshape *reshape, luci::CircleNeg *neg)
   return true;
 }
 
+bool forward_reshape(luci::CircleReshape *reshape, luci::CircleLogistic *logit)
+{
+  assert(reshape != nullptr); // FIX_CALLER_UNLESS
+  assert(logit != nullptr);   // FIX_CALLER_UNLESS
+
+  auto new_reshape = create_cloned_reshape(reshape);
+  if (not new_reshape)
+    return false;
+
+  // reconnect network: logit now reads reshape's input and new_reshape reads logit
+  loco::replace(logit).with(new_reshape);
+  logit->x(reshape->tensor());
+  new_reshape->tensor(logit);
+
+  // Mark logit's shape status UNDEFINED so shape inference is done for this node again.
+  logit->shape_status(luci::ShapeStatus::UNDEFINED);
+
+  return true;
+}
+
 class ForwardReshape final : public luci::CircleNodeMutableVisitor<bool>
 {
 protected:
@@ -103,6 +144,14 @@ protected:
     return forward_reshape(reshape, node);
   }
 
+  bool visit(luci::CircleLogistic *node)
+  {
+    auto reshape = as_reshape(node->x());
+    if (reshape == nullptr)
+      return false;
+
+    return forward_reshape(reshape, node);
+  }
   // TODO add more unary operators
 };
 
diff --git a/compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.test.cpp b/compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.test.cpp
index 2593a014c..373513270 100644
--- a/compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.test.cpp
+++ b/compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.test.cpp
@@ -65,6 +65,42 @@ protected:
   luci::CircleConst *_reshape_shape = nullptr;
 };
 
+// TODO Reduce duplicate code with ReshapeNegGraphlet
+class ReshapeLogisticGraphlet
+{
+public:
+  ReshapeLogisticGraphlet() = default;
+
+public:
+  void init(loco::Graph *g, const ShapeU32 shape_in, const ShapeU32 shape_out)
+  {
+    std::vector<uint32_t> shape_out_v = shape_out;
+
+    _reshape_shape = g->nodes()->create<luci::CircleConst>();
+    _reshape = g->nodes()->create<luci::CircleReshape>();
+    _logistic = g->nodes()->create<luci::CircleLogistic>();
+
+    _reshape_shape->dtype(loco::DataType::S32);
+    _reshape_shape->rank(1);
+    _reshape_shape->dim(0).set(shape_out_v.size());
+    _reshape_shape->shape_status(luci::ShapeStatus::VALID);
+    // values
+    const auto size = shape_out_v.size();
+    _reshape_shape->size<loco::DataType::S32>(size);
+    for (uint32_t i = 0; i < size; i++)
+      _reshape_shape->at<loco::DataType::S32>(i) = shape_out_v[i];
+
+    _reshape_shape->name("reshape_shape");
+    _reshape->name("reshape");
+    _logistic->name("logistic");
+  }
+
+protected:
+  luci::CircleReshape *_reshape = nullptr;
+  luci::CircleLogistic *_logistic = nullptr;
+  luci::CircleConst *_reshape_shape = nullptr;
+};
+
 class ForwardReshapeToNegGraph : public TestIOGraph, public ReshapeNegGraphlet
 {
 public:
@@ -85,6 +121,26 @@ public:
   }
 };
 
+class ForwardReshapeToLogisticGraph : public TestIOGraph, public ReshapeLogisticGraphlet
+{
+public:
+  ForwardReshapeToLogisticGraph() = default;
+
+public:
+  void init(const ShapeU32 shape_in, const ShapeU32 shape_out)
+  {
+    TestIOGraph::init(shape_in, shape_out);
+    ReshapeLogisticGraphlet::init(g(), shape_in, shape_out);
+
+    // connect network
+    _reshape->tensor(input());
+    _reshape->shape(_reshape_shape);
+    _logistic->x(_reshape);
+
+    output()->from(_logistic);
+  }
+};
+
 class ForwardReshapeToNegGraphTest : public ::testing::Test
 {
 public:
@@ -101,6 +157,22 @@ protected:
   luci::ForwardReshapeToUnaryOpPass _pass;
 };
 
+class ForwardReshapeToLogisticGraphTest : public ::testing::Test
+{
+public:
+  ForwardReshapeToLogisticGraphTest() = default;
+
+  void run_pass(void)
+  {
+    while (_pass.run(_graph.g()))
+      ;
+  }
+
+protected:
+  ForwardReshapeToLogisticGraph _graph;
+  luci::ForwardReshapeToUnaryOpPass _pass;
+};
+
 } // namespace
 
 TEST(ForwardReshapeToUnaryOpPassTest, name)
@@ -123,3 +195,17 @@ TEST_F(ForwardReshapeToNegGraphTest, simple_forward)
   neg = dynamic_cast<luci::CircleNeg *>(reshape->tensor());
   ASSERT_NE(nullptr, neg);
 }
+
+TEST_F(ForwardReshapeToLogisticGraphTest, forward)
+{
+  _graph.init({2, 2, 2}, {2, 4});
+
+  run_pass();
+
+  auto reshape = dynamic_cast<luci::CircleReshape *>(_graph.output()->from());
+  auto log = dynamic_cast<luci::CircleLogistic *>(_graph.output()->from());
+  ASSERT_NE(nullptr, reshape);
+  ASSERT_EQ(nullptr, log);
+  log = dynamic_cast<luci::CircleLogistic *>(reshape->tensor());
+  ASSERT_NE(nullptr, log);
+}
diff --git a/compiler/luci/pass/src/FuseAddWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseAddWithFullyConnectedPass.cpp
index 97a962cb6..3cf31ed10 100644
--- a/compiler/luci/pass/src/FuseAddWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseAddWithFullyConnectedPass.cpp
@@ -99,6 +99,12 @@ bool fuse_add_with_fc(luci::CircleFullyConnected *fc)
       fused_bias->at<loco::DataType::FLOAT32>(i) += const_bias->at<loco::DataType::FLOAT32>(i);
   }
 
+  // At this point, it is guaranteed that fused_bias's shape is [1, 1, ..., N] or [N]
+  // where N is weights->dim(0).
+  // The shape is normalized to [N] to become the bias of FC
+  fused_bias->rank(1);
+  fused_bias->dim(0) = weights->dim(0);
+
   fc->bias(fused_bias);
   fc->fusedActivationFunction(add->fusedActivationFunction());
 
diff --git a/compiler/luci/pass/src/FuseAddWithTConvPass.cpp b/compiler/luci/pass/src/FuseAddWithTConvPass.cpp
index 2bca57014..852bc8b63 100644
--- a/compiler/luci/pass/src/FuseAddWithTConvPass.cpp
+++ b/compiler/luci/pass/src/FuseAddWithTConvPass.cpp
@@ -37,10 +37,10 @@ namespace
  *             \    |
  *         [CircleTransposeConv]   [CircleAdd]
  *                  |
- *            ([CircleRelu6])
+ *          ([CircleRelu/Relu6])
  *                  |
  *
- *  Note: CircleRelu6 is inserted if Add activation is ReLU6
+ *  Note: CircleRelu/Relu6 is inserted if Add activation is ReLU/ReLU6
  */
 bool fuse_add_with_tconv(luci::CircleTransposeConv *tconv)
 {
@@ -65,7 +65,8 @@ bool fuse_add_with_tconv(luci::CircleTransposeConv *tconv)
   if (add->dtype() != loco::DataType::FLOAT32)
     return false;
   if (add->fusedActivationFunction() != luci::FusedActFunc::NONE &&
-      add->fusedActivationFunction() != luci::FusedActFunc::RELU6)
+      add->fusedActivationFunction() != luci::FusedActFunc::RELU6 &&
+      add->fusedActivationFunction() != luci::FusedActFunc::RELU)
     return false;
 
   // get addition
@@ -102,6 +103,19 @@ bool fuse_add_with_tconv(luci::CircleTransposeConv *tconv)
     // remove add node
     replace(add).with(relu);
   }
+  else if (add->fusedActivationFunction() == luci::FusedActFunc::RELU)
+  {
+    auto name = addition->name();
+    assert(name.length() > 0);
+    // separate relu op from add op
+    auto relu = add->graph()->nodes()->create<luci::CircleRelu>();
+    relu->features(tconv);
+    relu->name(name + "/Relu");
+    luci::add_origin(relu, luci::get_origin(add));
+
+    // remove add node
+    replace(add).with(relu);
+  }
   else
   {
     replace(add).with(tconv);
diff --git a/compiler/luci/pass/src/FuseBatchNormWithTConvPass.cpp b/compiler/luci/pass/src/FuseBatchNormWithTConvPass.cpp
index 337954960..e6b54df36 100644
--- a/compiler/luci/pass/src/FuseBatchNormWithTConvPass.cpp
+++ b/compiler/luci/pass/src/FuseBatchNormWithTConvPass.cpp
@@ -29,7 +29,7 @@ namespace
  *  NOTE TF's BatchNormalization is converted to Mul and Add.
  *
  *  BEFORE
- *                     |   [CircleOutputExclude]
+ *                     |   [CircleConst]/[CircleOutputExclude]
  *                     |   / [CircleConst]
  *                     |  / /
  *     [CircleTransposeConv]  [CircleConst]
@@ -40,7 +40,7 @@ namespace
  *                     |
  *
  *  AFTER
- *                     |                                          [CircleOutputExclude]
+ *                     |                                         [CircleConst]/[CircleOutputExclude]
  *                     +-------------------------------------+   / [CircleConst]
  *                     |                                     |  / /
  *                     |                     [CircleTransposeConv]  [CircleConst]
@@ -69,9 +69,10 @@ bool fused_batch_norm_with_tconv(luci::CircleAdd *add)
     return false;
 
   // check scale and shift constant attributes
-  if (scale->rank() != 1)
+  // TODO maybe rank check is not needed
+  if (scale->rank() != 1 && scale->rank() != 4)
     return false;
-  if (shift->rank() != 1)
+  if (shift->rank() != 1 && shift->rank() != 4)
     return false;
   // check mul, add attributes
   if (mul->dtype() != loco::DataType::FLOAT32)
@@ -82,9 +83,8 @@ bool fused_batch_norm_with_tconv(luci::CircleAdd *add)
       add->fusedActivationFunction() != luci::FusedActFunc::RELU6)
     return false;
 
-  // tconv bias should be not set
-  if (not dynamic_cast<luci::CircleOutputExclude *>(tconv->bias()))
-    return false;
+  // tconv bias is optional
+  auto bias = dynamic_cast<luci::CircleConst *>(tconv->bias());
 
   // get weight of tconv
   auto filter = dynamic_cast<luci::CircleConst *>(tconv->filter());
@@ -96,10 +96,36 @@ bool fused_batch_norm_with_tconv(luci::CircleAdd *add)
     return false;
 
   auto filter_out_chn = filter->dim(0).value();
-  if (filter_out_chn != scale->dim(0).value())
+  // allow scale/shift and bias shape of [N], [1,1,1,N]; BN works for "channel-wise"
+  auto srank = scale->rank() - 1;
+  if (filter_out_chn != scale->dim(srank).value())
     return false;
-  if (filter_out_chn != shift->dim(0).value())
+  for (uint32_t d = 0; d < srank; ++d)
+  {
+    if (1 != scale->dim(d).value())
+      return false;
+  }
+  srank = shift->rank() - 1;
+  if (filter_out_chn != shift->dim(srank).value())
     return false;
+  for (uint32_t d = 0; d < srank; ++d)
+  {
+    if (1 != shift->dim(d).value())
+      return false;
+  }
+  if (bias)
+  {
+    if (bias->dtype() != loco::DataType::FLOAT32)
+      return false;
+    srank = bias->rank() - 1;
+    if (filter_out_chn != bias->dim(srank).value())
+      return false;
+    for (uint32_t d = 0; d < srank; ++d)
+    {
+      if (1 != bias->dim(d).value())
+        return false;
+    }
+  }
 
   auto name = add->name();
   assert(name.length() > 0);
@@ -151,6 +177,11 @@ bool fused_batch_norm_with_tconv(luci::CircleAdd *add)
   for (uint32_t c = 0; c < filter_out_chn; ++c)
   {
     fused_bias->at<loco::DataType::FLOAT32>(c) = shift->at<loco::DataType::FLOAT32>(c);
+    if (bias != nullptr)
+    {
+      fused_bias->at<loco::DataType::FLOAT32>(c) +=
+        bias->at<loco::DataType::FLOAT32>(c) * scale->at<loco::DataType::FLOAT32>(c);
+    }
   }
   fused_bias->name(name + "/TransposeConv/bias");
 
@@ -166,6 +197,10 @@ bool fused_batch_norm_with_tconv(luci::CircleAdd *add)
   luci::add_origin(fused_tconv,
                    luci::composite_origin(
                      {luci::get_origin(add), luci::get_origin(mul), luci::get_origin(tconv)}));
+  if (bias != nullptr)
+  {
+    luci::add_origin(fused_tconv, luci::get_origin(bias));
+  }
 
   if (add->fusedActivationFunction() == luci::FusedActFunc::RELU6)
   {
diff --git a/compiler/luci/pass/src/FuseInstanceNormPass.cpp b/compiler/luci/pass/src/FuseInstanceNormPass.cpp
index f3ec6cd9e..10a651e35 100644
--- a/compiler/luci/pass/src/FuseInstanceNormPass.cpp
+++ b/compiler/luci/pass/src/FuseInstanceNormPass.cpp
@@ -325,6 +325,10 @@ public:
   }
 
 private:
+  bool condition_common_1_5(uint32_t ifm_channel_depth);
+  bool condition_common_3_4();
+
+private:
   template <enum PatternVersion> bool match();
 
 public:
@@ -368,21 +372,8 @@ private:
   if (not(condition))             \
     return false;
 
-template <> bool InstanceNormPattern::match<InstanceNormPattern::PatternVersion::Version_1>()
+bool InstanceNormPattern::condition_common_1_5(uint32_t ifm_channel_depth)
 {
-  CHECK_OR_FALSE(luci::fill(&mul_as_scaled_ifm, &sub).with_commutative_args_of(add_as_terminal));
-  CHECK_OR_FALSE(luci::fill(&ifm, &mul_gamma).with_commutative_args_of(mul_as_scaled_ifm));
-
-  auto ifm_circle = loco::must_cast<luci::CircleNode *>(ifm);
-  CHECK_OR_FALSE(ifm_circle->shape_status() == luci::ShapeStatus::VALID);
-  CHECK_OR_FALSE(ifm_circle->rank() == 4);
-  CHECK_OR_FALSE(ifm_circle->dim(3).known());
-  uint32_t ifm_channel_depth = ifm_circle->dim(3).value();
-
-  CHECK_OR_FALSE(luci::fill(&rsqrt, &const_as_gamma).with_commutative_args_of(mul_gamma));
-
-  CHECK_OR_FALSE(is_1D_with_dummy_dim(const_as_gamma, ifm_channel_depth));
-
   add_as_variance = dynamic_cast<luci::CircleAdd *>(rsqrt->x());
   CHECK_OR_FALSE(add_as_variance);
 
@@ -408,6 +399,70 @@ template <> bool InstanceNormPattern::match<InstanceNormPattern::PatternVersion:
   CHECK_OR_FALSE(const_as_beta);
   CHECK_OR_FALSE(is_1D_with_dummy_dim(const_as_beta, ifm_channel_depth));
 
+  return true;
+}
+
+bool InstanceNormPattern::condition_common_3_4()
+{
+  // check left sub
+  ifm = sub->x();
+  CHECK_OR_FALSE(ifm);
+
+  luci::CircleNode *ifm_node = loco::must_cast<luci::CircleNode *>(ifm);
+  CHECK_OR_FALSE(ifm_node->rank() == 4);
+  CHECK_OR_FALSE(ifm_node->dim(3).known());
+
+  mean_of_ifm = dynamic_cast<luci::CircleMean *>(sub->y());
+  CHECK_OR_FALSE(mean_of_ifm);
+  CHECK_OR_FALSE(ifm == mean_of_ifm->input());
+
+  // continue search from add_as_variance
+  CHECK_OR_FALSE(luci::fill(&sqrt, &const_as_epsilon).with_commutative_args_of(add_as_variance));
+  CHECK_OR_FALSE(const_as_epsilon->dtype() == loco::DataType::FLOAT32);
+  // TODO Support regarding broadcast
+  CHECK_OR_FALSE(const_as_epsilon->size<loco::DataType::FLOAT32>() == 1);
+
+  mean_as_variance = dynamic_cast<luci::CircleMean *>(sqrt->x());
+  CHECK_OR_FALSE(mean_as_variance);
+
+  square = dynamic_cast<luci::CircleSquare *>(mean_as_variance->input());
+  CHECK_OR_FALSE(square);
+
+  sub_2 = dynamic_cast<luci::CircleSub *>(square->x());
+  CHECK_OR_FALSE(sub_2);
+  CHECK_OR_FALSE(ifm == sub_2->x());
+
+  mean_of_ifm_2 = dynamic_cast<luci::CircleMean *>(sub_2->y());
+  CHECK_OR_FALSE(mean_of_ifm_2);
+  CHECK_OR_FALSE(ifm == mean_of_ifm_2->input());
+
+  loco::Node *ifm_should_be = nullptr;
+  luci::CircleMean *mean_of_ifm_2_should_be = nullptr;
+  CHECK_OR_FALSE(
+    luci::fill(&ifm_should_be, &mean_of_ifm_2_should_be).with_commutative_args_of(sub_2));
+  CHECK_OR_FALSE(ifm == ifm_should_be);
+  CHECK_OR_FALSE(mean_of_ifm_2 == mean_of_ifm_2_should_be);
+
+  return true;
+}
+
+template <> bool InstanceNormPattern::match<InstanceNormPattern::PatternVersion::Version_1>()
+{
+  CHECK_OR_FALSE(luci::fill(&mul_as_scaled_ifm, &sub).with_commutative_args_of(add_as_terminal));
+  CHECK_OR_FALSE(luci::fill(&ifm, &mul_gamma).with_commutative_args_of(mul_as_scaled_ifm));
+
+  auto ifm_circle = loco::must_cast<luci::CircleNode *>(ifm);
+  CHECK_OR_FALSE(ifm_circle->shape_status() == luci::ShapeStatus::VALID);
+  CHECK_OR_FALSE(ifm_circle->rank() == 4);
+  CHECK_OR_FALSE(ifm_circle->dim(3).known());
+  uint32_t ifm_channel_depth = ifm_circle->dim(3).value();
+
+  CHECK_OR_FALSE(luci::fill(&rsqrt, &const_as_gamma).with_commutative_args_of(mul_gamma));
+
+  CHECK_OR_FALSE(is_1D_with_dummy_dim(const_as_gamma, ifm_channel_depth));
+
+  CHECK_OR_FALSE(condition_common_1_5(ifm_channel_depth));
+
   luci::CircleMul *mul_gamma_should_be = nullptr;
   luci::CircleMean *mean_of_ifm_should_be = nullptr;
 
@@ -488,44 +543,7 @@ template <> bool InstanceNormPattern::match<InstanceNormPattern::PatternVersion:
   CHECK_OR_FALSE(luci::fill(&div, &const_as_gamma).with_commutative_args_of(mul_gamma));
   CHECK_OR_FALSE(luci::fill(&sub, &add_as_variance).with_commutative_args_of(div));
 
-  // check left sub
-  ifm = sub->x();
-  CHECK_OR_FALSE(ifm);
-
-  luci::CircleNode *ifm_node = loco::must_cast<luci::CircleNode *>(ifm);
-  CHECK_OR_FALSE(ifm_node->rank() == 4);
-  CHECK_OR_FALSE(ifm_node->dim(3).known());
-
-  mean_of_ifm = dynamic_cast<luci::CircleMean *>(sub->y());
-  CHECK_OR_FALSE(mean_of_ifm);
-  CHECK_OR_FALSE(ifm == mean_of_ifm->input());
-
-  // continue search from add_as_variance
-  CHECK_OR_FALSE(luci::fill(&sqrt, &const_as_epsilon).with_commutative_args_of(add_as_variance));
-  CHECK_OR_FALSE(const_as_epsilon->dtype() == loco::DataType::FLOAT32);
-  // TODO Support regarding broadcast
-  CHECK_OR_FALSE(const_as_epsilon->size<loco::DataType::FLOAT32>() == 1);
-
-  mean_as_variance = dynamic_cast<luci::CircleMean *>(sqrt->x());
-  CHECK_OR_FALSE(mean_as_variance);
-
-  square = dynamic_cast<luci::CircleSquare *>(mean_as_variance->input());
-  CHECK_OR_FALSE(square);
-
-  sub_2 = dynamic_cast<luci::CircleSub *>(square->x());
-  CHECK_OR_FALSE(sub_2);
-  CHECK_OR_FALSE(ifm == sub_2->x());
-
-  mean_of_ifm_2 = dynamic_cast<luci::CircleMean *>(sub_2->y());
-  CHECK_OR_FALSE(mean_of_ifm_2);
-  CHECK_OR_FALSE(ifm == mean_of_ifm_2->input());
-
-  loco::Node *ifm_should_be = nullptr;
-  luci::CircleMean *mean_of_ifm_2_should_be = nullptr;
-  CHECK_OR_FALSE(
-    luci::fill(&ifm_should_be, &mean_of_ifm_2_should_be).with_commutative_args_of(sub_2));
-  CHECK_OR_FALSE(ifm == ifm_should_be);
-  CHECK_OR_FALSE(mean_of_ifm_2 == mean_of_ifm_2_should_be);
+  CHECK_OR_FALSE(condition_common_3_4());
 
   _matched = true;
   return true;
@@ -546,44 +564,7 @@ template <> bool InstanceNormPattern::match<InstanceNormPattern::PatternVersion:
   CHECK_OR_FALSE(div);
   CHECK_OR_FALSE(luci::fill(&sub, &add_as_variance).with_commutative_args_of(div));
 
-  // check left sub
-  ifm = sub->x();
-  CHECK_OR_FALSE(ifm);
-
-  luci::CircleNode *ifm_node = loco::must_cast<luci::CircleNode *>(ifm);
-  CHECK_OR_FALSE(ifm_node->rank() == 4);
-  CHECK_OR_FALSE(ifm_node->dim(3).known());
-
-  mean_of_ifm = dynamic_cast<luci::CircleMean *>(sub->y());
-  CHECK_OR_FALSE(mean_of_ifm);
-  CHECK_OR_FALSE(ifm == mean_of_ifm->input());
-
-  // continue search from add_as_variance
-  CHECK_OR_FALSE(luci::fill(&sqrt, &const_as_epsilon).with_commutative_args_of(add_as_variance));
-  CHECK_OR_FALSE(const_as_epsilon->dtype() == loco::DataType::FLOAT32);
-  // TODO Support regarding broadcast
-  CHECK_OR_FALSE(const_as_epsilon->size<loco::DataType::FLOAT32>() == 1);
-
-  mean_as_variance = dynamic_cast<luci::CircleMean *>(sqrt->x());
-  CHECK_OR_FALSE(mean_as_variance);
-
-  square = dynamic_cast<luci::CircleSquare *>(mean_as_variance->input());
-  CHECK_OR_FALSE(square);
-
-  sub_2 = dynamic_cast<luci::CircleSub *>(square->x());
-  CHECK_OR_FALSE(sub_2);
-  CHECK_OR_FALSE(ifm == sub_2->x());
-
-  mean_of_ifm_2 = dynamic_cast<luci::CircleMean *>(sub_2->y());
-  CHECK_OR_FALSE(mean_of_ifm_2);
-  CHECK_OR_FALSE(ifm == mean_of_ifm_2->input());
-
-  loco::Node *ifm_should_be = nullptr;
-  luci::CircleMean *mean_of_ifm_2_should_be = nullptr;
-  CHECK_OR_FALSE(
-    luci::fill(&ifm_should_be, &mean_of_ifm_2_should_be).with_commutative_args_of(sub_2));
-  CHECK_OR_FALSE(ifm == ifm_should_be);
-  CHECK_OR_FALSE(mean_of_ifm_2 == mean_of_ifm_2_should_be);
+  CHECK_OR_FALSE(condition_common_3_4());
 
   assert(const_as_gamma == nullptr);
   assert(const_as_beta == nullptr);
@@ -612,30 +593,7 @@ template <> bool InstanceNormPattern::match<InstanceNormPattern::PatternVersion:
   CHECK_OR_FALSE(ifm_circle->dim(3).known());
   uint32_t ifm_channel_depth = ifm_circle->dim(3).value();
 
-  add_as_variance = dynamic_cast<luci::CircleAdd *>(rsqrt->x());
-  CHECK_OR_FALSE(add_as_variance);
-
-  CHECK_OR_FALSE(
-    luci::fill(&mean_as_variance, &const_as_epsilon).with_commutative_args_of(add_as_variance));
-
-  CHECK_OR_FALSE(const_as_epsilon->dtype() == loco::DataType::FLOAT32);
-  // TODO Support regarding broadcast
-  CHECK_OR_FALSE(const_as_epsilon->size<loco::DataType::FLOAT32>() == 1);
-
-  CHECK_OR_FALSE(is_instance_mean_v1(mean_as_variance));
-
-  sqdiff = dynamic_cast<luci::CircleSquaredDifference *>(mean_as_variance->input());
-  CHECK_OR_FALSE(sqdiff);
-
-  loco::Node *ifm_should_be = nullptr;
-  CHECK_OR_FALSE(luci::fill(&ifm_should_be, &mean_of_ifm).with_commutative_args_of(sqdiff));
-  CHECK_OR_FALSE(ifm == ifm_should_be);
-  CHECK_OR_FALSE(is_instance_mean_v1(mean_of_ifm));
-  CHECK_OR_FALSE(ifm == mean_of_ifm->input());
-
-  const_as_beta = dynamic_cast<luci::CircleConst *>(sub->x());
-  CHECK_OR_FALSE(const_as_beta);
-  CHECK_OR_FALSE(is_1D_with_dummy_dim(const_as_beta, ifm_channel_depth));
+  CHECK_OR_FALSE(condition_common_1_5(ifm_channel_depth));
 
   luci::CircleRsqrt *rsqrt_should_be = nullptr;
   luci::CircleMean *mean_of_ifm_should_be = nullptr;
diff --git a/compiler/luci/pass/src/PropagateQParamBackwardPass.cpp b/compiler/luci/pass/src/PropagateQParamBackwardPass.cpp
index b4975486d..e8fa2a478 100644
--- a/compiler/luci/pass/src/PropagateQParamBackwardPass.cpp
+++ b/compiler/luci/pass/src/PropagateQParamBackwardPass.cpp
@@ -23,6 +23,7 @@
 #include <luci/Log.h>
 
 #include <cmath>
+#include <limits>
 
 namespace
 {
diff --git a/compiler/luci/pass/src/PropagateQParamForwardPass.cpp b/compiler/luci/pass/src/PropagateQParamForwardPass.cpp
index 003e4c293..aaadb2864 100644
--- a/compiler/luci/pass/src/PropagateQParamForwardPass.cpp
+++ b/compiler/luci/pass/src/PropagateQParamForwardPass.cpp
@@ -138,13 +138,18 @@ struct PropagateQParamForward final : public luci::CircleNodeMutableVisitor<bool
     auto qtype = luci::activation_qtype(input_node);
     switch (qtype)
     {
-      case luci::ActivationQType::PreDefinedValue:
-        node->quantparam(luci::make_predefined_qparam(input_node->opcode(), node->dtype()));
+      case luci::ActivationQType::PreDefinedLogistic:
+      case luci::ActivationQType::PreDefinedTanh:
+      case luci::ActivationQType::PreDefinedSoftmax:
+        node->quantparam(luci::make_predefined_qparam(qtype, node->dtype()));
         break;
       case luci::ActivationQType::IntScale:
         luci::set_int_scale(node);
         break;
       default:
+        // This assert ensures this switch-statement handles all ActivationQTypes
+        // TODO Find a better design to remove coupling with ActivationQType
+        assert(qtype == luci::ActivationQType::MinMax);
         break;
     }
 
diff --git a/compiler/luci/pass/src/QuantizationUtils.cpp b/compiler/luci/pass/src/QuantizationUtils.cpp
index ad86cedf4..06a4ae9f6 100644
--- a/compiler/luci/pass/src/QuantizationUtils.cpp
+++ b/compiler/luci/pass/src/QuantizationUtils.cpp
@@ -20,6 +20,7 @@
 
 #include <iostream>
 #include <cmath>
+#include <limits>
 
 namespace luci
 {
@@ -276,31 +277,70 @@ uint32_t cal_offset(loco::TensorShape &dimension, uint32_t *indices)
          indices[2] * dimension.dim(3).value() + indices[3];
 }
 
+// Activation (ofm) qtype is determined in different ways.
+// 1. Pre-defined values: Some Ops have pre-defined qparams (ex: LOGISTIC, TANH, SOFTMAX)
+// 2. Integer scale: Output of some Ops should be integers (ex: FLOOR, CEIL)
+// 3. Activation qtype of input: Some Ops take the qtype of their input (ex: QUANTIZE, TRANSPOSE;
+//    see PropagateQParamForwardPass.cpp). Any other Op uses recorded min/max (MinMax).
 ActivationQType activation_qtype(const CircleNode *node)
 {
   auto fused_act_node = dynamic_cast<const CircleNodeMixin<CircleNodeTrait::FusedActFunc> *>(node);
   if (fused_act_node && fused_act_node->fusedActivationFunction() == FusedActFunc::TANH)
-    return ActivationQType::PreDefinedValue;
+    return ActivationQType::PreDefinedTanh;
+
+#define RETURN_INPUT_ACTIVATION_QTYPE(CLASS, INPUT)         \
+  {                                                         \
+    auto n = loco::must_cast<const CLASS *>(node);          \
+    auto input = loco::must_cast<CircleNode *>(n->INPUT()); \
+    return activation_qtype(input);                         \
+  }
 
   switch (node->opcode())
   {
     case CircleOpcode::LOGISTIC:
+      return ActivationQType::PreDefinedLogistic;
     case CircleOpcode::TANH:
+      return ActivationQType::PreDefinedTanh;
     case CircleOpcode::SOFTMAX:
-      return ActivationQType::PreDefinedValue;
+      return ActivationQType::PreDefinedSoftmax;
     case CircleOpcode::FLOOR:
     case CircleOpcode::FLOOR_DIV:
     case CircleOpcode::FLOOR_MOD:
     case CircleOpcode::CEIL:
       return ActivationQType::IntScale;
+    case CircleOpcode::GATHER:
+      RETURN_INPUT_ACTIVATION_QTYPE(CircleGather, params);
+    case CircleOpcode::RESHAPE:
+      RETURN_INPUT_ACTIVATION_QTYPE(CircleReshape, tensor);
+    case CircleOpcode::TRANSPOSE:
+      RETURN_INPUT_ACTIVATION_QTYPE(CircleTranspose, a);
+    case CircleOpcode::STRIDED_SLICE:
+      RETURN_INPUT_ACTIVATION_QTYPE(CircleStridedSlice, input);
+    case CircleOpcode::SPLIT:
+      RETURN_INPUT_ACTIVATION_QTYPE(CircleSplit, input);
+    case CircleOpcode::CIRCLESPLITOUT:
+      RETURN_INPUT_ACTIVATION_QTYPE(CircleSplitOut, input);
+    case CircleOpcode::SPLIT_V:
+      RETURN_INPUT_ACTIVATION_QTYPE(CircleSplitV, input);
+    case CircleOpcode::CIRCLESPLITVOUT:
+      RETURN_INPUT_ACTIVATION_QTYPE(CircleSplitVOut, input);
+    case CircleOpcode::UNPACK:
+      RETURN_INPUT_ACTIVATION_QTYPE(CircleUnpack, value);
+    case CircleOpcode::CIRCLEUNPACKOUT:
+      RETURN_INPUT_ACTIVATION_QTYPE(CircleUnpackOut, input);
+    case CircleOpcode::QUANTIZE:
+      RETURN_INPUT_ACTIVATION_QTYPE(CircleQuantize, input);
     default:
       break;
   }
 
+#undef RETURN_INPUT_ACTIVATION_QTYPE
+
   return ActivationQType::MinMax;
 }
 
-std::unique_ptr<CircleQuantParam> make_predefined_qparam(CircleOpcode opcode, loco::DataType dtype)
+std::unique_ptr<CircleQuantParam> make_predefined_qparam(ActivationQType qtype,
+                                                         loco::DataType dtype)
 {
   auto qparam = std::make_unique<CircleQuantParam>();
 
@@ -309,9 +349,9 @@ std::unique_ptr<CircleQuantParam> make_predefined_qparam(CircleOpcode opcode, lo
     qparam->zerop.emplace_back(zp);
   };
 
-  switch (opcode)
+  switch (qtype)
   {
-    case CircleOpcode::LOGISTIC:
+    case ActivationQType::PreDefinedLogistic:
       if (dtype == loco::DataType::U8)
         set_qparam(1.0f / 256.0f, 0);
       else
@@ -320,7 +360,7 @@ std::unique_ptr<CircleQuantParam> make_predefined_qparam(CircleOpcode opcode, lo
         set_qparam(1.0f / 32768.0f, 0);
       }
       break;
-    case CircleOpcode::TANH:
+    case ActivationQType::PreDefinedTanh:
       if (dtype == loco::DataType::U8)
         set_qparam(2.0f / 256.0f, 128);
       else
@@ -329,7 +369,7 @@ std::unique_ptr<CircleQuantParam> make_predefined_qparam(CircleOpcode opcode, lo
         set_qparam(1.0f / 32768.0f, 0);
       }
       break;
-    case CircleOpcode::SOFTMAX:
+    case ActivationQType::PreDefinedSoftmax:
       if (dtype == loco::DataType::U8)
         set_qparam(1.0f / 255.0f, 0);
       else
@@ -341,7 +381,7 @@ std::unique_ptr<CircleQuantParam> make_predefined_qparam(CircleOpcode opcode, lo
     default:
       throw std::runtime_error("Unsupported opcode with pre-defined qparam");
   }
-  return std::move(qparam);
+  return qparam;
 }
 
 // For nodes with integer output, we use integer scale
@@ -395,4 +435,74 @@ void quant_const(luci::CircleConst *node, loco::DataType quant_type)
   node->quantparam(std::move(quantparam));
 }
 
+namespace
+{
+
+// Returns the bit width of dt. TODO move this to a more global helper file
+int nbits(loco::DataType dt) noexcept
+{
+  switch (dt)
+  {
+    case loco::DataType::S8:
+    case loco::DataType::U8:
+      return 8;
+    case loco::DataType::S16:
+    case loco::DataType::U16:
+    case loco::DataType::FLOAT16:
+      return 16;
+    case loco::DataType::S32:
+    case loco::DataType::U32:
+    case loco::DataType::FLOAT32:
+      return 32;
+    case loco::DataType::S64:
+      return 64;
+    default:
+      return 64; // types not listed above are treated as 64-bit (a safe large default)
+  }
+}
+
+// TODO Check if the metric is valid (NOTE(review): log2f is NaN for min < 0, -inf for min == 0)
+// Returns true if [min,max] is poorly representable: log2(max) - log2(min) > 1.5 * nbits(dtype)
+bool range_check(float min, float max, loco::DataType dtype)
+{
+  float thresh = 1.5f; // allowed log2 span per bit of dtype
+  return log2f(max) - log2f(min) > nbits(dtype) * thresh; // a NaN difference compares false
+}
+
+bool warn_scale_zp(float scale, int64_t zp, luci::CircleNode *n)
+{
+  float min, max;
+  // estimate the real-valued min/max that scale/zp can represent for n's dtype
+  switch (n->dtype())
+  {
+    case loco::DataType::U8:
+      min = scale * (0 - zp); // <= 0 whenever scale > 0 and zp >= 0
+      max = scale * (255 - zp);
+      break;
+    case loco::DataType::S16:
+      min = scale * (-32767); // zp is not used for S16
+      max = scale * (32767);
+      break;
+    default:
+      return false; // only U8 and S16 are checked
+  }
+  return range_check(min, max, n->dtype());
+}
+
+} // namespace
+
+void warn_accuracy_with_range(luci::CircleNode *n)
+{
+  LOGGER(l);
+  auto qp = n->quantparam();
+  if (qp == nullptr)
+    return; // n is not quantized: there is nothing to check
+  // scale and zerop are read pairwise, so stop at the shorter of the two
+  auto k = qp->scale.size() < qp->zerop.size() ? qp->scale.size() : qp->zerop.size();
+  for (size_t i = 0; i < k; i++)
+    if (warn_scale_zp(qp->scale[i], qp->zerop[i], n))
+      WARN(l) << "Quantization of " << i << "-th channel of " << n->name()
+              << "'s quantization may cause accuracy issues" << std::endl;
+}
+
 } // namespace luci
diff --git a/compiler/luci/pass/src/QuantizationUtils.h b/compiler/luci/pass/src/QuantizationUtils.h
index cd8cec95a..4d5316ccb 100644
--- a/compiler/luci/pass/src/QuantizationUtils.h
+++ b/compiler/luci/pass/src/QuantizationUtils.h
@@ -62,15 +62,19 @@ bool is_quantized(const CircleNode *node);
 
 enum ActivationQType
 {
-  MinMax,          // Quantize using recorded min/max
-  PreDefinedValue, // Quantize using pre-defined values
-  IntScale,        // Round scale to a positive integer
+  MinMax,             // Quantize using recorded min/max
+  PreDefinedLogistic, // Quantize using pre-defined values for LOGISTIC
+  PreDefinedTanh,     // Quantize using pre-defined values for TANH (incl. fused TANH)
+  PreDefinedSoftmax,  // Quantize using pre-defined values for SOFTMAX
+  IntScale,           // Round scale to a positive integer
 };
 
 ActivationQType activation_qtype(const CircleNode *node);
 
 // Create qparam with pre-defined values for speical operators
-std::unique_ptr<CircleQuantParam> make_predefined_qparam(CircleOpcode opcode, loco::DataType dtype);
+std::unique_ptr<CircleQuantParam> make_predefined_qparam(CircleNode *node, loco::DataType dtype);
+std::unique_ptr<CircleQuantParam> make_predefined_qparam(ActivationQType qtype,
+                                                         loco::DataType dtype);
 
 // Update node's scale to a positive integer (for special Ops e.g., Floor, Ceil)
 void set_int_scale(luci::CircleNode *node);
@@ -78,6 +82,10 @@ void set_int_scale(luci::CircleNode *node);
 // Quantize const tensor using its min/max values
 void quant_const(luci::CircleConst *node, loco::DataType quant_type);
 
+// Check that a node is quantized without significant loss of precision;
+// Emits warnings to log with WARN
+void warn_accuracy_with_range(luci::CircleNode *n);
+
 } // namespace luci
 
 #endif // __LUCI_QUANTIZATION_UTILS_H__
diff --git a/compiler/luci/pass/src/QuantizeActivation.cpp b/compiler/luci/pass/src/QuantizeActivation.cpp
index 149331824..95251a82c 100644
--- a/compiler/luci/pass/src/QuantizeActivation.cpp
+++ b/compiler/luci/pass/src/QuantizeActivation.cpp
@@ -114,29 +114,26 @@ void QuantizeSpecialActivation::visit(luci::CircleNode *node)
   auto fused_act_node = dynamic_cast<CircleNodeMixin<CircleNodeTrait::FusedActFunc> *>(node);
   if (fused_act_node != nullptr && fused_act_node->fusedActivationFunction() == FusedActFunc::TANH)
   {
-    auto qparam = make_predefined_qparam(luci::CircleOpcode::TANH, output_type);
+    auto qparam = make_predefined_qparam(luci::ActivationQType::PreDefinedTanh, output_type);
     node->quantparam(std::move(qparam));
   }
 }
 
 void QuantizeSpecialActivation::visit(luci::CircleLogistic *node)
 {
-  assert(activation_qtype(node) == luci::ActivationQType::PreDefinedValue);
-  auto qparam = make_predefined_qparam(luci::CircleOpcode::LOGISTIC, output_type);
+  auto qparam = make_predefined_qparam(luci::ActivationQType::PreDefinedLogistic, output_type);
   node->quantparam(std::move(qparam));
 }
 
 void QuantizeSpecialActivation::visit(luci::CircleTanh *node)
 {
-  assert(activation_qtype(node) == luci::ActivationQType::PreDefinedValue);
-  auto qparam = make_predefined_qparam(luci::CircleOpcode::TANH, output_type);
+  auto qparam = make_predefined_qparam(luci::ActivationQType::PreDefinedTanh, output_type);
   node->quantparam(std::move(qparam));
 }
 
 void QuantizeSpecialActivation::visit(luci::CircleSoftmax *node)
 {
-  assert(activation_qtype(node) == luci::ActivationQType::PreDefinedValue);
-  auto qparam = make_predefined_qparam(luci::CircleOpcode::SOFTMAX, output_type);
+  auto qparam = make_predefined_qparam(luci::ActivationQType::PreDefinedSoftmax, output_type);
   node->quantparam(std::move(qparam));
 }
 
diff --git a/compiler/luci/pass/src/QuantizeBias.cpp b/compiler/luci/pass/src/QuantizeBias.cpp
index aa496232a..de97a14dd 100644
--- a/compiler/luci/pass/src/QuantizeBias.cpp
+++ b/compiler/luci/pass/src/QuantizeBias.cpp
@@ -22,6 +22,7 @@
 
 #include <algorithm>
 #include <cmath>
+#include <limits>
 
 using namespace luci;
 
@@ -201,6 +202,18 @@ CircleConst *QuantizeBias::quantized_bias(CircleNode *input, const CircleNode *w
     std::vector<float> scaling_factor(size);
     std::vector<int64_t> zp(size);
 
+    if (const_bias->rank() == 0)
+    {
+      // TODO Support quantization of scalar bias
+      throw std::runtime_error("Quantization of scalar bias is not yet supported (" +
+                               const_bias->name() + ")");
+    }
+    if (size != const_bias->dim(const_bias->rank() - 1).value())
+    {
+      throw std::runtime_error(const_bias->name() +
+                               " (bias) should have the shape of [1, 1, .. 1, channel]");
+    }
+
     if (output_type == loco::DataType::U8)
     {
       new_bias = quant_bias_per_channel(const_bias, input_scale, weight_scale, scaling_factor, zp);
@@ -218,6 +231,7 @@ CircleConst *QuantizeBias::quantized_bias(CircleNode *input, const CircleNode *w
     auto quantparam = std::make_unique<CircleQuantParam>();
     quantparam->scale = scaling_factor;
     quantparam->zerop = zp;
+    quantparam->quantized_dimension = const_bias->rank() - 1;
     assert(new_bias->quantparam() == nullptr); // bias should not be quantized before
     new_bias->quantparam(std::move(quantparam));
 
diff --git a/compiler/luci/pass/src/QuantizeBias.test.cpp b/compiler/luci/pass/src/QuantizeBias.test.cpp
new file mode 100644
index 000000000..0104a191b
--- /dev/null
+++ b/compiler/luci/pass/src/QuantizeBias.test.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "QuantizeBias.h"
+
+#include <luci/test/TestIOGraph.h>
+#include <luci/IR/CircleNodes.h>
+#include <luci/IR/CircleQuantParam.h>
+
+#include <gtest/gtest.h>
+
+using namespace luci;
+
+namespace
+{
+
+using namespace luci::test;
+
+// Makes a const filled with `value`. TODO Reduce duplicate codes in ResolveCustomOpMatMulPass.cpp
+template <typename T>
+luci::CircleConst *create_const_node(loco::Graph *g, const loco::DataType dtype,
+                                     const std::vector<uint32_t> &shape, T value)
+{
+  auto node = g->nodes()->create<luci::CircleConst>();
+  node->dtype(dtype);
+  node->rank(shape.size());
+
+  uint32_t size = 1; // element count: product of all entries of shape
+  for (uint32_t i = 0; i < shape.size(); ++i)
+  {
+    node->dim(i) = shape.at(i);
+    size *= shape.at(i);
+  }
+  node->shape_status(luci::ShapeStatus::VALID);
+
+#define INIT_VALUES(DT)                 \
+  {                                     \
+    node->size<DT>(size);               \
+    for (uint32_t i = 0; i < size; ++i) \
+      node->at<DT>(i) = value;          \
+  }
+
+  switch (dtype) // only U8, S16, S32 and FLOAT32 are supported
+  {
+    case loco::DataType::U8:
+      INIT_VALUES(loco::DataType::U8);
+      break;
+    case loco::DataType::S16:
+      INIT_VALUES(loco::DataType::S16);
+      break;
+    case loco::DataType::S32:
+      INIT_VALUES(loco::DataType::S32);
+      break;
+    case loco::DataType::FLOAT32:
+      INIT_VALUES(loco::DataType::FLOAT32)
+      break;
+    default:
+      INTERNAL_EXN("create_const_node called with unsupported type");
+      break;
+  }
+  return node;
+}
+
+/**
+ *  Simple FullyConnected graph for testing QuantizeBias (U8 input, weights and output)
+ *
+ *  BEFORE
+ *
+ *   [IFM] [WEIGHTS] [BIAS(FP32)]
+ *        \   |     /
+ *           [FC]
+ *            |
+ *          [OFM]
+ *
+ *  AFTER
+ *
+ *   [IFM] [WEIGHTS] [BIAS(Quantized)]
+ *        \   |     /
+ *           [FC]
+ *            |
+ *          [OFM]
+ */
+struct Q8FCGraphlet
+{
+public:
+  Q8FCGraphlet() = default;
+  virtual ~Q8FCGraphlet() = default;
+
+  void init(loco::Graph *g, const ShapeU32 out_shape, const ShapeU32 w_shape,
+            const ShapeU32 bias_shape, const float bv)
+  {
+    _fc = g->nodes()->create<luci::CircleFullyConnected>();
+    _fc->input(_x); // _x must already be assigned by the derived graph
+    _x->dtype(loco::DataType::U8);
+    {
+      // layer-wise qparam for the input: scale 1, zero point 0
+      auto quantparam = std::make_unique<CircleQuantParam>();
+      quantparam->scale.push_back(1.0);
+      quantparam->zerop.push_back(0);
+      quantparam->quantized_dimension = 0;
+      _x->quantparam(std::move(quantparam));
+    }
+
+    auto weights = create_const_node<uint8_t>(g, loco::DataType::U8, w_shape, 1.0);
+    auto w_qparam = std::make_unique<CircleQuantParam>();
+    std::vector<float> w_scale(weights->dim(0).value(), 1.0); // one scale per dim(0) entry
+    std::vector<int64_t> w_zp(weights->dim(0).value(), 0);    // one zero point per dim(0) entry
+    w_qparam->scale = w_scale;
+    w_qparam->zerop = w_zp;
+    w_qparam->quantized_dimension = 0;
+    weights->quantparam(std::move(w_qparam));
+    _fc->weights(weights);
+    _fc->fusedActivationFunction(luci::FusedActFunc::NONE);
+    _fc->dtype(loco::DataType::U8);
+    _fc->shape(out_shape);
+    // bias starts as a FLOAT32 const; QuantizeBias is expected to swap in a quantized one
+    _fc->bias(create_const_node(g, loco::DataType::FLOAT32, bias_shape, bv));
+    _fc->name("fc");
+    {
+      // layer-wise qparam for the FC output: scale 1, zero point 0
+      auto quantparam = std::make_unique<CircleQuantParam>();
+      quantparam->scale.push_back(1.0);
+      quantparam->zerop.push_back(0);
+      quantparam->quantized_dimension = 0;
+      _fc->quantparam(std::move(quantparam));
+    }
+  }
+
+public:
+  luci::CircleFullyConnected *fc() { return _fc; }
+
+protected:
+  luci::CircleFullyConnected *_fc = nullptr;
+  luci::CircleInput *_x = nullptr; // init() dereferences this; assign it first
+};
+
+struct Q8FCGraph final : public TestIGraphlet, public TestOGraphlet, public Q8FCGraphlet
+{
+  void init(const ShapeU32 in_shape, const ShapeU32 w_shape, const ShapeU32 out_shape,
+            const ShapeU32 bias_shape, const float bv)
+  {
+    TestIGraphlet::init(g(), in_shape);
+    TestOGraphlet::init(g(), out_shape);
+    _x = input(); // must precede Q8FCGraphlet::init, which dereferences _x
+    Q8FCGraphlet::init(g(), out_shape, w_shape, bias_shape, bv); // note: out_shape goes first
+    output()->from(_fc);
+  }
+};
+
+class CQ8QuantizeBiasFCTest : public ::testing::Test
+{
+public:
+  Q8FCGraph g; // FC graph whose FP32 bias is visited by qb
+  luci::QuantizeBias qb{loco::DataType::FLOAT32, loco::DataType::U8,
+                        luci::QuantizationGranularity::ChannelWise}; // presumably {in, out, gran}
+};
+
+} // namespace
+
+TEST_F(CQ8QuantizeBiasFCTest, fully_connected)
+{
+  g.init({1, 18, 80}, {256, 80}, {18, 256}, {1, 256}, 1); // in, weights, out, bias shapes; bv
+  g.fc()->accept(&qb);
+
+  auto bias = loco::must_cast<CircleConst *>(g.fc()->bias());
+  auto qparam = bias->quantparam();
+
+  ASSERT_NE(nullptr, qparam); // fatal on failure: the checks below dereference qparam
+  EXPECT_EQ(256, qparam->scale.size());
+  EXPECT_EQ(256, qparam->zerop.size());
+  EXPECT_EQ(1, qparam->quantized_dimension); // last axis of the rank-2 {1, 256} bias
+}
+
+TEST_F(CQ8QuantizeBiasFCTest, wrong_bias_shape_NEG)
+{
+  g.init({1, 18, 80}, {256, 80}, {18, 256}, {1, 2, 128}, 1);
+  EXPECT_ANY_THROW(g.fc()->accept(&qb)); // Wrong bias shape: last dim is 128, weights dim(0) 256
+}
diff --git a/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp b/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp
index c9b35e0be..ef047d35d 100644
--- a/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp
+++ b/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp
@@ -27,6 +27,7 @@
 #include <iostream>
 #include <cmath>
 #include <functional>
+#include <limits>
 
 namespace
 {
@@ -352,15 +353,15 @@ private:
 private:
   // Check if
   // 1. node is const
-  // 2. node was not quantized
+  // 2. node's dtype is float32
   bool is_quantizable(loco::Node *node)
   {
     auto const_node = dynamic_cast<luci::CircleConst *>(node);
     if (not const_node)
       return false;
 
-    // Skip if this is already quantized
-    if (is_quantized(const_node))
+    // Skip if this is not float32
+    if (const_node->dtype() != loco::DataType::FLOAT32)
       return false;
 
     return true;
diff --git a/compiler/luci/pass/src/QuantizeWeights.cpp b/compiler/luci/pass/src/QuantizeWeights.cpp
index 11322ab44..500ae12ed 100644
--- a/compiler/luci/pass/src/QuantizeWeights.cpp
+++ b/compiler/luci/pass/src/QuantizeWeights.cpp
@@ -23,6 +23,7 @@
 #include <cmath>
 #include <vector>
 #include <functional>
+#include <limits>
 
 using namespace luci;
 
diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
index d9a9d4db7..005144516 100644
--- a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
+++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
@@ -41,10 +41,28 @@ namespace
 {
 
 using namespace luci;
+
+bool use_predefined_values(ActivationQType qtype)
+{
+  // Only the PreDefined* qtypes carry fixed, op-specific qparams
+  const bool predefined = qtype == ActivationQType::PreDefinedLogistic or
+                          qtype == ActivationQType::PreDefinedTanh or
+                          qtype == ActivationQType::PreDefinedSoftmax;
+  if (predefined)
+    return true;
+
+  // Every remaining ActivationQType must be one of the two listed here;
+  // when asserts are enabled this fires if a new enumerator is added
+  // without updating this function
+  assert(qtype == ActivationQType::IntScale or qtype == ActivationQType::MinMax);
+
+  return false;
+}
+
 // Create a Quantize Op whose
 // dtype is out_type
 // shape is the same with node
-// qparam is computed using node's min/max
+// qparam is computed according to node's qtype
 luci::CircleQuantize *create_quantize_op(luci::CircleNode *node, loco::DataType out_type)
 {
   auto quantize = node->graph()->nodes()->create<CircleQuantize>();
@@ -60,9 +78,9 @@ luci::CircleQuantize *create_quantize_op(luci::CircleNode *node, loco::DataType
   assert(qparam); // FIX_CALLER_UNLESS
 
   auto qtype = luci::activation_qtype(node);
-  if (qtype == ActivationQType::PreDefinedValue)
+  if (use_predefined_values(qtype))
   {
-    quantize->quantparam(luci::make_predefined_qparam(node->opcode(), out_type));
+    quantize->quantparam(luci::make_predefined_qparam(qtype, out_type));
     return quantize;
   }
 
@@ -105,6 +123,23 @@ luci::CircleQuantize *create_quantize_op(luci::CircleNode *node, loco::DataType
   return quantize;
 }
 
+// Create a FLOAT32 Dequantize Op named "<node>_Dequantize" with node's shape; input is not set
+luci::CircleDequantize *create_dequantize(luci::CircleNode *node)
+{
+  auto dequantize = node->graph()->nodes()->create<luci::CircleDequantize>();
+  dequantize->name(node->name() + "_Dequantize");
+  dequantize->dtype(loco::DataType::FLOAT32);
+  dequantize->rank(node->rank());
+  for (uint32_t i = 0; i < node->rank(); i++)
+    dequantize->dim(i).set(node->dim(i).value()); // NOTE(review): known() is not checked
+
+  dequantize->shape_status(luci::ShapeStatus::VALID);
+
+  luci::add_origin(dequantize, luci::get_origin(node)); // reuse node's origin
+
+  return dequantize;
+}
+
 } // namespace
 
 namespace luci
@@ -229,11 +264,13 @@ private:
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleFullyConnected, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleGather, params)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleInstanceNorm, input)
+  INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleLeakyRelu, features)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleLocalResponseNormalization, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleLogistic, x)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleMaxPool2D, value)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleMean, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleMirrorPad, input)
+  INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleNeg, x)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CirclePad, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CirclePadV2, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CirclePRelu, input)
@@ -241,6 +278,7 @@ private:
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleReduceMax, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleReduceMin, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleRelu, features)
+  INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleRelu6, features)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleReshape, tensor)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleResizeBilinear, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleResizeNearestNeighbor, input)
@@ -250,6 +288,7 @@ private:
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleSoftmax, logits)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleSpaceToBatchND, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleSpaceToDepth, input)
+  INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleSqueeze, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleSqrt, x)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleStridedSlice, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleSum, input)
@@ -353,7 +392,9 @@ void QuantizeWithMinMaxPass::set_input_type(loco::Graph *g) const
       luci::add_origin(quant_op, luci::get_origin(succ));
     }
 
-    // Requantize input
+    // Update qparam of input
+    // This step is skipped if input_type is float32
+    if (_ctx->input_type != loco::DataType::FLOAT32)
     {
       auto quantparam = input->quantparam();
       assert(quantparam);
@@ -376,11 +417,13 @@ void QuantizeWithMinMaxPass::set_input_type(loco::Graph *g) const
         assert(_ctx->input_type == loco::DataType::S16);
         compute_sym_scale_zp(min, max, scaling_factor, zp, nudged_min, nudged_max);
       }
-      input->dtype(_ctx->input_type);
       input->quantparam()->scale[0] = scaling_factor;
       input->quantparam()->zerop[0] = zp;
     }
 
+    // Update dtype of input
+    input->dtype(_ctx->input_type);
+
     auto graph_input = inputs->at(input->index());
     graph_input->dtype(_ctx->input_type);
   }
@@ -405,13 +448,26 @@ void QuantizeWithMinMaxPass::set_output_type(loco::Graph *g) const
     if (not from->quantparam())
       continue;
 
-    // Insert Quantize Op
-    auto quant_op = create_quantize_op(from, _ctx->output_type);
-    loco::replace(from).with(quant_op);
-    quant_op->input(from);
+    // Insert Dequantize Op for float32 output_type
+    if (_ctx->output_type == loco::DataType::FLOAT32)
+    {
+      auto dequant_op = create_dequantize(from);
+      loco::replace(from).with(dequant_op);
+      dequant_op->input(from);
+    }
+    else
+    {
+      // Insert Quantize Op for non-float32 output_type
+      auto quant_op = create_quantize_op(from, _ctx->output_type);
+      loco::replace(from).with(quant_op);
+      quant_op->input(from);
 
-    // TODO Set a proper origin (Quantize should have its own Origin)
-    luci::add_origin(quant_op, luci::get_origin(from));
+      // TODO Set a proper origin (Quantize should have its own Origin)
+      luci::add_origin(quant_op, luci::get_origin(from));
+    }
+
+    // Update dtype of output
+    output->dtype(_ctx->output_type);
 
     auto graph_output = outputs->at(output->index());
     graph_output->dtype(_ctx->output_type);
@@ -594,12 +650,25 @@ bool QuantizeWithMinMaxPass::run(loco::Graph *g)
   // Set output type
   set_output_type(g);
 
+  // Remove redundant Quantize Op
+  {
+    logo::Phase phase;
+
+    phase.emplace_back(std::make_unique<luci::RemoveRedundantQuantizePass>());
+
+    ProgressReporter prog(g, logo::PhaseStrategy::Saturate);
+    logo::PhaseRunner<logo::PhaseStrategy::Saturate> phase_runner{g};
+    phase_runner.attach(&prog);
+    phase_runner.run(phase);
+  }
+
   // Remove min/max values
   for (auto node : loco::active_nodes(loco::output_nodes(g)))
   {
     auto circle_node = loco::must_cast<luci::CircleNode *>(node);
     if (auto qparam = circle_node->quantparam())
     {
+      warn_accuracy_with_range(circle_node);
       qparam->min.clear();
       qparam->max.clear();
     }
diff --git a/compiler/luci/pass/src/QuantizedModelVerifier.test.cpp b/compiler/luci/pass/src/QuantizedModelVerifier.test.cpp
index cebafd32b..21b4fe1c6 100644
--- a/compiler/luci/pass/src/QuantizedModelVerifier.test.cpp
+++ b/compiler/luci/pass/src/QuantizedModelVerifier.test.cpp
@@ -1088,6 +1088,31 @@ private:
   luci::CircleConst *_const = nullptr;
 };
 
+// Test graph: input -> ReduceMax (constant S32 axis, keep_dims = false) -> output
+class ReduceMaxTestGraph final : public SimpleTestGraph
+{
+public:
+  void init(void) override
+  {
+    TestIOGraph::init({4, 3, 2}, {2});
+
+    // Reduction indices are supplied as a 4-element S32 constant
+    _axis = create_const<Type::S32, int32_t>(g(), {4}, {1, 0, -3, -3});
+
+    _reduce_max = g()->nodes()->create<luci::CircleReduceMax>();
+    _reduce_max->name("test");
+    _reduce_max->keep_dims(false);
+    _reduce_max->reduction_indices(_axis);
+    _reduce_max->input(input());
+
+    output()->from(_reduce_max);
+
+    set_minmax_to_non_const(g(), -1, 1);
+  }
+
+private:
+  luci::CircleReduceMax *_reduce_max = nullptr;
+  luci::CircleConst *_axis = nullptr;
+};
+
 class ResizeBilinearTestGraph final : public SimpleTestGraph
 {
 public:
@@ -2345,6 +2370,34 @@ TEST(QuantizedModelVerifierTest, Pow_wrong_granularity_NEG)
   SUCCEED();
 }
 
+// Positive case: feeds ReduceMaxTestGraph to TEST_WITH_GRAPH and TEST_WITH_LAYER_INFO for
+// U8 layer-wise, U8 channel-wise and S16 channel-wise settings.
+TEST(QuantizedModelVerifierTest, ReduceMax)
+{
+  TEST_WITH_GRAPH(ReduceMaxTestGraph, Type::U8, Granularity::LayerWise);
+  TEST_WITH_GRAPH(ReduceMaxTestGraph, Type::U8, Granularity::ChannelWise);
+  TEST_WITH_GRAPH(ReduceMaxTestGraph, Type::S16, Granularity::ChannelWise);
+
+  TEST_WITH_LAYER_INFO(ReduceMaxTestGraph, Type::U8, Granularity::LayerWise);
+  TEST_WITH_LAYER_INFO(ReduceMaxTestGraph, Type::U8, Granularity::ChannelWise);
+  TEST_WITH_LAYER_INFO(ReduceMaxTestGraph, Type::S16, Granularity::ChannelWise);
+  SUCCEED();
+}
+
+// Negative case: each TEST_WITH_WRONG_TYPE call names two different types
+// (U8 with S16, or S16 with U8) for ReduceMaxTestGraph.
+TEST(QuantizedModelVerifierTest, ReduceMax_wrong_type_NEG)
+{
+  TEST_WITH_WRONG_TYPE(ReduceMaxTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+  TEST_WITH_WRONG_TYPE(ReduceMaxTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+  TEST_WITH_WRONG_TYPE(ReduceMaxTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+  SUCCEED();
+}
+
+// Negative case: runs TEST_WITH_WRONG_GRANULARITY on ReduceMaxTestGraph for the same three
+// type/granularity combinations used by the positive test.
+TEST(QuantizedModelVerifierTest, ReduceMax_wrong_granularity_NEG)
+{
+  TEST_WITH_WRONG_GRANULARITY(ReduceMaxTestGraph, Type::U8, Granularity::LayerWise);
+  TEST_WITH_WRONG_GRANULARITY(ReduceMaxTestGraph, Type::U8, Granularity::ChannelWise);
+  TEST_WITH_WRONG_GRANULARITY(ReduceMaxTestGraph, Type::S16, Granularity::ChannelWise);
+  SUCCEED();
+}
+
 TEST(QuantizedModelVerifierTest, ResizeBilinear)
 {
   TEST_WITH_GRAPH(ResizeBilinearTestGraph, Type::U8, Granularity::LayerWise);
diff --git a/compiler/luci/pass/src/RemoveRedundantDequantizePass.cpp b/compiler/luci/pass/src/RemoveRedundantDequantizePass.cpp
new file mode 100644
index 000000000..66cd9d791
--- /dev/null
+++ b/compiler/luci/pass/src/RemoveRedundantDequantizePass.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/RemoveRedundantDequantizePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+namespace
+{
+
+/**
+ * @brief Bypass `dequant` when the node feeding it already produces FLOAT32
+ *
+ * @param dequant Dequantize node to inspect (must not be null)
+ * @return true if every use of `dequant` was redirected to its input,
+ *         false if the input is not FLOAT32 and the graph was left untouched
+ */
+bool remove_redundant_dequant(luci::CircleDequantize *dequant)
+{
+  assert(dequant != nullptr);
+
+  auto prev = loco::must_cast<luci::CircleNode *>(dequant->input());
+  if (prev->dtype() != loco::DataType::FLOAT32)
+    return false;
+
+  // Qualify the call instead of relying on argument-dependent lookup to find loco::replace
+  loco::replace(dequant).with(prev);
+
+  return true;
+}
+
+} // namespace
+
+namespace luci
+{
+/**
+ * Dequantize Op does the below things on the ifm.
+ * 1. Element-wise update of quantized values (u8/s16) to fp32 values
+ * 2. Update dtype to fp32
+ * If the previous node is not quantized, dequantize Op is redundant.
+ *
+ * BEFORE
+ *
+ *     [CircleNode (A)]
+ *            |
+ *     [CircleNode (B)] (fp32)
+ *            |
+ *    [CircleDequantize]
+ *            |
+ *       [CircleNode]
+ *
+ * AFTER
+ *
+ *     [CircleNode (A)]
+ *            |
+ *     [CircleNode (B)] (fp32)
+ *            |
+ *       [CircleNode]
+ */
+bool RemoveRedundantDequantizePass::run(loco::Graph *g)
+{
+  bool modified = false;
+
+  // Visit every node reachable from the graph outputs; only Dequantize nodes are candidates
+  for (auto active : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto dequantize = dynamic_cast<luci::CircleDequantize *>(active);
+    if (dequantize == nullptr)
+      continue;
+
+    // The helper is always invoked; its result is merged into the overall flag
+    modified = remove_redundant_dequant(dequantize) || modified;
+  }
+
+  return modified;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/RemoveRedundantDequantizePass.test.cpp b/compiler/luci/pass/src/RemoveRedundantDequantizePass.test.cpp
new file mode 100644
index 000000000..adb2f14a4
--- /dev/null
+++ b/compiler/luci/pass/src/RemoveRedundantDequantizePass.test.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/RemoveRedundantDequantizePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <luci/test/TestIOGraph.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+// Graphlet that owns a single CircleDequantize node (FLOAT32, named "dequantize")
+class DequantizeGraphlet
+{
+public:
+  DequantizeGraphlet() = default;
+
+public:
+  // Creates the Dequantize node in `g`; its input is left for the caller to connect
+  void init(loco::Graph *g)
+  {
+    _dequantize = g->nodes()->create<luci::CircleDequantize>();
+    _dequantize->dtype(loco::DataType::FLOAT32);
+    _dequantize->name("dequantize");
+  }
+
+protected:
+  luci::CircleDequantize *_dequantize = nullptr; // set by init()
+};
+
+// Test graph: input -> Dequantize -> output
+class RedundantDequantizeGraph : public TestIOGraph, public DequantizeGraphlet
+{
+public:
+  RedundantDequantizeGraph() = default;
+
+public:
+  // Builds the graph without changing the input dtype set up by TestIOGraph::init
+  void init(void)
+  {
+    TestIOGraph::init({1}, {1});
+    DequantizeGraphlet::init(g());
+
+    _dequantize->input(input());
+
+    output()->from(_dequantize);
+  }
+
+  // Builds the same graph, but with a U8 input that carries quantization parameters
+  void init_u8_input(void)
+  {
+    TestIOGraph::init({1}, {1});
+    DequantizeGraphlet::init(g());
+
+    // Use u8 input (dequantize is not redundant anymore)
+    input()->dtype(loco::DataType::U8);
+    {
+      auto qparam = std::make_unique<luci::CircleQuantParam>();
+      qparam->scale = {1};
+      qparam->zerop = {1};
+      input()->quantparam(std::move(qparam));
+    }
+
+    _dequantize->input(input());
+
+    output()->from(_dequantize);
+  }
+};
+
+} // namespace
+
+// The pass must report a change and leave no Dequantize node reachable from the outputs
+TEST(RemoveRedundantDequantizePass, single_redundant_dequantize)
+{
+  RedundantDequantizeGraph g;
+  g.init();
+
+  luci::RemoveRedundantDequantizePass pass;
+  EXPECT_TRUE(pass.run(g.g()));
+
+  int num_dequantize = 0;
+  for (auto active : loco::active_nodes(loco::output_nodes(g.g())))
+  {
+    if (dynamic_cast<luci::CircleDequantize *>(active) != nullptr)
+      ++num_dequantize;
+  }
+
+  ASSERT_EQ(0, num_dequantize);
+}
+
+// With a U8 input the Dequantize node is not redundant, so the pass must report no change
+TEST(RemoveRedundantDequantizePass, wrong_dtype_NEG)
+{
+  RedundantDequantizeGraph g;
+  luci::RemoveRedundantDequantizePass pass;
+
+  g.init_u8_input();
+
+  EXPECT_FALSE(pass.run(g.g()));
+}
diff --git a/compiler/luci/pass/src/RemoveUnnecessaryReshapeNetPass.cpp b/compiler/luci/pass/src/RemoveUnnecessaryReshapeNetPass.cpp
new file mode 100644
index 000000000..476ec68bf
--- /dev/null
+++ b/compiler/luci/pass/src/RemoveUnnecessaryReshapeNetPass.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/RemoveUnnecessaryReshapeNetPass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+namespace
+{
+
+// Return true if `node` is one of the ops allowed between the pre- and post-Reshape
+// (Add, Mul, Tanh, Logistic). A null node is rejected.
+bool acceptable_intermediate_op(const loco::Node *node)
+{
+  if (node == nullptr)
+    return false;
+
+  const auto circle_node = loco::must_cast<const luci::CircleNode *>(node);
+
+  switch (circle_node->opcode())
+  {
+    case luci::CircleOpcode::ADD:
+    case luci::CircleOpcode::MUL:
+    case luci::CircleOpcode::TANH:
+    case luci::CircleOpcode::LOGISTIC:
+      return true;
+
+    default:
+      return false;
+  }
+}
+
+// Return true if both nodes have the same rank and every dimension compares equal
+bool same_shape(const loco::Node *a, const loco::Node *b)
+{
+  const auto lhs = loco::must_cast<const luci::CircleNode *>(a);
+  const auto rhs = loco::must_cast<const luci::CircleNode *>(b);
+
+  const auto rank = lhs->rank();
+  if (rank != rhs->rank())
+    return false;
+
+  for (uint32_t axis = 0; axis < rank; ++axis)
+  {
+    const bool equal_dim = (lhs->dim(axis) == rhs->dim(axis));
+    if (not equal_dim)
+      return false;
+  }
+
+  return true;
+}
+
+/**
+ * @brief Searches upward from a "post" Reshape for the "pre" Reshape nodes that it undoes
+ *
+ * The search walks through Add/Mul/Tanh/Logistic nodes (see acceptable_intermediate_op)
+ * and succeeds only if every traversed path ends at a Reshape whose input shape equals
+ * the output shape of the post Reshape and whose output shape equals its input shape.
+ */
+class PreReshapeFinder
+{
+public:
+  PreReshapeFinder(const luci::CircleReshape *post_reshape) : _post_reshape(post_reshape)
+  {
+    assert(post_reshape != nullptr); // FIX_CALLER_UNLESS
+  }
+
+public:
+  // Return true if pre_reshapes are found
+  // NOTE When false is returned, pre_reshapes() may hold a partial result; do not use it
+  bool collect_pre_reshapes(loco::Node *node)
+  {
+    // Reject a missing operand up front: the only other null check lives in
+    // acceptable_intermediate_op(), which runs after loco::succs(node) below
+    if (node == nullptr)
+      return false;
+
+    // TODO Support diamond case
+    if (loco::succs(node).size() != 1)
+      return false;
+
+    if (auto pre_reshape = dynamic_cast<luci::CircleReshape *>(node))
+    {
+      // Check ifm of pre-reshape and ofm of post_reshape
+      if (not same_shape(pre_reshape->tensor(), _post_reshape))
+        return false;
+
+      // Check ofm of pre-reshape and ifm of post_reshape
+      if (not same_shape(pre_reshape, _post_reshape->tensor()))
+        return false;
+
+      _pre_reshapes.emplace_back(pre_reshape);
+      return true;
+    }
+
+    if (not acceptable_intermediate_op(node))
+      return false;
+
+    // Every input of the intermediate op must itself lead to a matching pre-reshape
+    for (uint32_t i = 0; i < node->arity(); i++)
+    {
+      if (not collect_pre_reshapes(node->arg(i)))
+        return false;
+    }
+
+    return true;
+  }
+
+public:
+  // Collected pre-reshapes; returned by reference to avoid copying the vector
+  const std::vector<luci::CircleReshape *> &pre_reshapes(void) const { return _pre_reshapes; }
+
+private:
+  const luci::CircleReshape *_post_reshape = nullptr;
+  std::vector<luci::CircleReshape *> _pre_reshapes;
+};
+
+// Try to drop `reshape` together with the matching Reshape nodes found above it.
+// Return true if the graph was rewritten.
+bool remove_unnecessary_reshape_net(luci::CircleReshape *reshape)
+{
+  PreReshapeFinder finder(reshape);
+
+  const bool found = finder.collect_pre_reshapes(reshape->tensor());
+  if (not found)
+    return false;
+
+  // Bypass each pre-reshape by redirecting its users to its own input
+  for (auto pre : finder.pre_reshapes())
+  {
+    auto pre_input = pre->tensor();
+    loco::replace(pre).with(pre_input);
+  }
+
+  // Bypass the post-reshape in the same way
+  auto post_input = reshape->tensor();
+  loco::replace(reshape).with(post_input);
+
+  return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ * BEFORE
+ *
+ *      [CircleNode]
+ *            |
+ *    [CircleReshape_1] (shape: A -> B)
+ *            |
+ *      [CircleNode] (ex: Add/Mul/Tanh/Logistic ..)
+ *            |
+ *    [CircleReshape_2] (shape: B -> A)
+ *            |
+ *      [CircleNode]
+ *
+ * AFTER
+ *
+ *      [CircleNode]
+ *            |   \
+ *            |   [CircleReshape_1]
+ *      [CircleNode]
+ *            |   \
+ *            |   [CircleReshape_2]
+ *      [CircleNode]
+ **/
+bool RemoveUnnecessaryReshapeNetPass::run(loco::Graph *g)
+{
+  bool modified = false;
+
+  // Each Reshape reachable from the outputs is tried as the "post" reshape of a net
+  for (auto active : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto post_reshape = dynamic_cast<luci::CircleReshape *>(active);
+    if (post_reshape == nullptr)
+      continue;
+
+    modified = remove_unnecessary_reshape_net(post_reshape) || modified;
+  }
+
+  return modified;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/RemoveUnnecessaryReshapeNetPass.test.cpp b/compiler/luci/pass/src/RemoveUnnecessaryReshapeNetPass.test.cpp
new file mode 100644
index 000000000..4ad707ba3
--- /dev/null
+++ b/compiler/luci/pass/src/RemoveUnnecessaryReshapeNetPass.test.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "luci/Pass/RemoveUnnecessaryReshapeNetPass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+// Fixture that builds: input -> Reshape -> Logistic -> Reshape -> output
+class RemoveUnnecessaryReshapeNet : public ::testing::Test
+{
+public:
+  RemoveUnnecessaryReshapeNet() {}
+
+  // Attach an S32 shape constant holding `shape` to `target` and set `target`'s own
+  // output shape to the same dimensions
+  void createReshapeConst(luci::CircleReshape *target, const std::vector<uint32_t> &shape)
+  {
+    auto shape_const = g.nodes()->create<luci::CircleConst>();
+    shape_const->dtype(loco::DataType::S32);
+    shape_const->size<loco::DataType::S32>(shape.size());
+    shape_const->shape_status(luci::ShapeStatus::VALID);
+    shape_const->rank(1);
+    shape_const->dim(0).set(shape.size());
+    // Unsigned index: shape.size() is unsigned, so a signed counter mixes signedness
+    for (uint32_t i = 0; i < shape.size(); i++)
+    {
+      shape_const->at<loco::DataType::S32>(i) = static_cast<int32_t>(shape.at(i));
+    }
+    shape_const->name("shape_const");
+    target->shape(shape_const);
+    target->rank(shape.size());
+    for (uint32_t i = 0; i < shape.size(); i++)
+    {
+      target->dim(i) = shape[i];
+    }
+    target->shape_status(luci::ShapeStatus::VALID);
+  }
+
+  // base_shape  : shape of the graph input
+  // first_shape : output shape of the first Reshape (also used for Logistic)
+  // second_shape: output shape of the second Reshape
+  void buildGraph(const std::initializer_list<uint32_t> base_shape,
+                  const std::initializer_list<uint32_t> first_shape,
+                  const std::initializer_list<uint32_t> second_shape)
+  {
+    // Input Create.
+    input = g.nodes()->create<luci::CircleInput>();
+    auto graph_input = g.inputs()->create();
+    input->index(graph_input->index());
+    input->shape_status(luci::ShapeStatus::VALID);
+    input->shape(base_shape);
+    input->name("input");
+
+    // Create first reshape.
+    first_reshape = g.nodes()->create<luci::CircleReshape>();
+    first_reshape->tensor(input);
+    first_reshape->name("Reshape");
+    createReshapeConst(first_reshape, first_shape);
+
+    // Create logistic.
+    logistic = g.nodes()->create<luci::CircleLogistic>();
+    logistic->x(first_reshape);
+    logistic->name("logistic");
+    logistic->shape(first_shape);
+    logistic->shape_status(luci::ShapeStatus::VALID);
+
+    // Create second reshape.
+    second_reshape = g.nodes()->create<luci::CircleReshape>();
+    second_reshape->tensor(logistic);
+    second_reshape->name("second_reshape");
+    createReshapeConst(second_reshape, second_shape);
+
+    // Output Connect.
+    output = g.nodes()->create<luci::CircleOutput>();
+    output->from(second_reshape);
+    output->name("output");
+    auto graph_output = g.outputs()->create();
+    output->index(graph_output->index());
+  }
+
+public:
+  loco::Graph g;
+  luci::CircleInput *input = nullptr;
+  luci::CircleReshape *first_reshape = nullptr;
+  luci::CircleLogistic *logistic = nullptr;
+  luci::CircleReshape *second_reshape = nullptr;
+  luci::CircleOutput *output = nullptr;
+};
+
+} // namespace
+
+// The second Reshape restores the input shape, so the pass must remove both Reshape nodes
+TEST_F(RemoveUnnecessaryReshapeNet, simple_case)
+{
+  buildGraph({1, 1, 1, 32}, {1, 1, 32, 1}, {1, 1, 1, 32});
+  luci::RemoveUnnecessaryReshapeNetPass pass;
+
+  ASSERT_TRUE(pass.run(&g));
+
+  int count = 0;
+  for (auto node : loco::active_nodes(loco::output_nodes(&g)))
+  {
+    // Only the type matters here, so no named variable is bound to the cast result
+    if (dynamic_cast<luci::CircleReshape *>(node) != nullptr)
+      count++;
+  }
+  ASSERT_EQ(0, count);
+}
+
+// The second Reshape yields {1, 1, 2, 16}, which differs from the input shape,
+// so the pass must report no change
+TEST_F(RemoveUnnecessaryReshapeNet, shape_mismatch_NEG)
+{
+  buildGraph({1, 1, 1, 32}, {1, 1, 32, 1}, {1, 1, 2, 16});
+  luci::RemoveUnnecessaryReshapeNetPass pass;
+  ASSERT_FALSE(pass.run(&g));
+}
diff --git a/compiler/luci/pass/src/ReplaceNonConstFCWithBatchMatMulPass.cpp b/compiler/luci/pass/src/ReplaceNonConstFCWithBatchMatMulPass.cpp
new file mode 100644
index 000000000..741b70956
--- /dev/null
+++ b/compiler/luci/pass/src/ReplaceNonConstFCWithBatchMatMulPass.cpp
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+#include <luci/Pass/ReplaceNonConstFCWithBatchMatMulPass.h>
+
+namespace
+{
+
+// TODO move to global helper list if needed
+/**
+ * @brief Create the node that applies fused activation function `act` to `inp`
+ *
+ * @return `inp` itself for NONE, otherwise a new Relu/Relu6/ReluN1To1/Tanh node
+ *         created in the graph of `inp` with `inp` as its input
+ * @throws std::invalid_argument for SIGN_BIT (no matching node) and for unknown values
+ */
+luci::CircleNode *fromActivation(luci::CircleNode *inp, luci::FusedActFunc act)
+{
+  if (act == luci::FusedActFunc::NONE)
+    return inp;
+
+  if (act == luci::FusedActFunc::RELU)
+  {
+    auto relu = inp->graph()->nodes()->create<luci::CircleRelu>();
+    relu->features(inp);
+    return relu;
+  }
+
+  if (act == luci::FusedActFunc::RELU6)
+  {
+    auto relu6 = inp->graph()->nodes()->create<luci::CircleRelu6>();
+    relu6->features(inp);
+    return relu6;
+  }
+
+  if (act == luci::FusedActFunc::RELU_N1_TO_1)
+  {
+    auto relu_n1_to_1 = inp->graph()->nodes()->create<luci::CircleReluN1To1>();
+    relu_n1_to_1->features(inp);
+    return relu_n1_to_1;
+  }
+
+  if (act == luci::FusedActFunc::TANH)
+  {
+    auto tanh = inp->graph()->nodes()->create<luci::CircleTanh>();
+    tanh->x(inp);
+    return tanh;
+  }
+
+  if (act == luci::FusedActFunc::SIGN_BIT)
+    throw std::invalid_argument("no matching node to create from fused activation");
+
+  throw std::invalid_argument("invalid fused activation");
+}
+
+/**
+ *  Replace Fully Connected with Batched MatMul
+ *
+ *  BEFORE
+ *
+ *         [Node1]         [Node2]
+ *           |               |
+ *       [transpose]?   [transpose]?
+ *               \        /
+ *            [FullyConnected]
+ *
+ *  AFTER
+ *
+ *              [Node1]  [Node2]
+ *                  \      /
+ *               [BatchMatMul] [BiasValue]?
+ *                        \       /
+ *                          [Add]?
+ *                            |
+ *                       [Activation]?
+ *
+ * Nodes with "?" denote optional elements
+ */
+/**
+ * @brief Rewrite a FullyConnected whose weights are not constant into BatchMatMul
+ *        (+ Add for the bias, + an activation node when no Add is emitted)
+ *
+ * @return true if `fc` was replaced; false if the weights (or the input of a weight
+ *         Transpose) are constant, or if input/weights/bias are not all FLOAT32
+ */
+bool replace_fc_with_matmul(luci::CircleFullyConnected *fc)
+{
+  luci::CircleNode *x = nullptr;
+  luci::CircleNode *y = nullptr;
+  bool adj_x = false;
+  bool adj_y = true;
+
+  // Only FullyConnected with non-constant weights is rewritten by this pass
+  if (dynamic_cast<luci::CircleConst *>(fc->weights()))
+    return false;
+
+  if (auto ty = dynamic_cast<luci::CircleTranspose *>(fc->weights())) // is y a transpose?
+  {
+    if (dynamic_cast<luci::CircleConst *>(ty->a()))
+      return false;
+
+    // Use the Transpose input directly and drop the adjoint flag instead
+    // NOTE(review): the permutation of the Transpose is not inspected here - confirm
+    adj_y = false;
+    y = loco::must_cast<luci::CircleNode *>(ty->a());
+  }
+  else
+  { // y is not transpose and not const
+    y = loco::must_cast<luci::CircleNode *>(fc->weights());
+  }
+
+  if (auto tx = dynamic_cast<luci::CircleTranspose *>(fc->input()))
+  {
+    adj_x = true;
+    x = loco::must_cast<luci::CircleNode *>(tx->a());
+  }
+  else
+  {
+    x = loco::must_cast<luci::CircleNode *>(fc->input());
+  }
+
+  auto b = loco::must_cast<luci::CircleNode *>(fc->bias());
+
+  if (x->dtype() != loco::DataType::FLOAT32 || y->dtype() != loco::DataType::FLOAT32 ||
+      b->dtype() != loco::DataType::FLOAT32)
+    return false;
+
+  auto name = fc->name();
+  assert(name.length() > 0);
+
+  auto matmul = fc->graph()->nodes()->create<luci::CircleBatchMatMul>();
+  matmul->x(x);
+  matmul->y(y);
+  matmul->adj_x(adj_x);
+  matmul->adj_y(adj_y);
+  matmul->name(name);
+  matmul->dtype(fc->dtype());
+
+  luci::add_origin(matmul, luci::get_origin(fc));
+
+  auto all_zero = [](const luci::CircleConst *c) {
+    bool ac = true;
+    for (uint32_t i = 0; i < c->size<loco::DataType::FLOAT32>() && ac; i++)
+    {
+      ac &= c->at<loco::DataType::FLOAT32>(i) == 0.0f;
+    }
+    return ac;
+  };
+
+  // The bias can be skipped only when it is absent (CircleOutputExclude) or a constant
+  // made of zeros. Any other bias, including a non-constant one, must be added back;
+  // otherwise it would silently disappear from the rewritten graph.
+  const auto bc = dynamic_cast<luci::CircleConst *>(b);
+  const bool bias_absent = dynamic_cast<luci::CircleOutputExclude *>(b) != nullptr;
+  const bool bias_is_zero = (nullptr != bc) && all_zero(bc);
+
+  if (not bias_absent && not bias_is_zero)
+  {
+    auto bias_add = fc->graph()->nodes()->create<luci::CircleAdd>();
+    bias_add->x(matmul);
+    bias_add->y(b);
+    bias_add->name(fc->name() + "/bias_add");
+    bias_add->dtype(fc->dtype());
+    luci::add_origin(bias_add, luci::get_origin(fc));
+    // The activation stays fused, now on the Add
+    bias_add->fusedActivationFunction(fc->fusedActivationFunction());
+    loco::replace(fc).with(bias_add);
+  }
+  else
+  {
+    // No Add to carry the fused activation, so materialize it as a separate node
+    // (fromActivation returns matmul itself for FusedActFunc::NONE)
+    auto n = fromActivation(matmul, fc->fusedActivationFunction());
+    luci::add_origin(n, luci::get_origin(fc));
+    n->name(fc->name() + "fusedActivation");
+    n->dtype(fc->dtype());
+    loco::replace(fc).with(n);
+  }
+
+  return true;
+}
+} // namespace
+
+namespace luci
+{
+
+bool ReplaceNonConstFCWithBatchMatMulPass::run(loco::Graph *g)
+{
+  bool modified = false;
+
+  // Try the rewrite on every FullyConnected reachable from the graph outputs
+  for (auto active : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto fully_connected = dynamic_cast<luci::CircleFullyConnected *>(active);
+    if (fully_connected == nullptr)
+      continue;
+
+    modified = replace_fc_with_matmul(fully_connected) || modified;
+  }
+
+  return modified;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/ReplaceNonConstFCWithBatchMatMulPass.test.cpp b/compiler/luci/pass/src/ReplaceNonConstFCWithBatchMatMulPass.test.cpp
new file mode 100644
index 000000000..7606a6125
--- /dev/null
+++ b/compiler/luci/pass/src/ReplaceNonConstFCWithBatchMatMulPass.test.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/ReplaceNonConstFCWithBatchMatMulPass.h"
+
+#include <luci/test/TestIOGraph.h>
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+// TODO Reduce duplicate codes in ResolveCustomOpMatMulPass.cpp
+/**
+ * @brief Create a CircleConst of `dtype` and `shape` in `g`, filled from `values`
+ *
+ * Supported dtypes are U8, S16, S32 and FLOAT32; any other dtype raises INTERNAL_EXN.
+ * The element count is the product of `shape`; `values` must not hold more elements.
+ */
+template <typename T>
+luci::CircleConst *create_const_node(loco::Graph *g, const loco::DataType dtype,
+                                     const std::vector<uint32_t> &shape,
+                                     const std::vector<T> &values)
+{
+  auto node = g->nodes()->create<luci::CircleConst>();
+  node->dtype(dtype);
+  node->rank(shape.size());
+
+  uint32_t size = 1;
+  for (uint32_t i = 0; i < shape.size(); ++i)
+  {
+    node->dim(i) = shape.at(i);
+    size *= shape.at(i);
+  }
+  node->shape_status(luci::ShapeStatus::VALID);
+
+  // The fill loop below runs over values.size(); refuse inputs that would write past
+  // the `size` elements allocated for the node
+  if (values.size() > size)
+    INTERNAL_EXN("create_const_node called with more values than elements");
+
+#define INIT_VALUES(DT)                          \
+  {                                              \
+    node->size<DT>(size);                        \
+    for (uint32_t i = 0; i < values.size(); ++i) \
+      node->at<DT>(i) = values[i];               \
+  }
+
+  switch (dtype)
+  {
+    case loco::DataType::U8:
+      INIT_VALUES(loco::DataType::U8);
+      break;
+    case loco::DataType::S16:
+      INIT_VALUES(loco::DataType::S16);
+      break;
+    case loco::DataType::S32:
+      INIT_VALUES(loco::DataType::S32);
+      break;
+    case loco::DataType::FLOAT32:
+      INIT_VALUES(loco::DataType::FLOAT32)
+      break;
+    default:
+      INTERNAL_EXN("create_const_node called with unsupported type");
+      break;
+  }
+
+// Keep the helper macro local to this function
+#undef INIT_VALUES
+
+  return node;
+}
+
+/**
+ *  Simple graph for test
+ *
+ *  BEFORE
+ *
+ *   [IFM1] [IFM2] [BIAS]
+ *        \   |   /
+ *          [FC]
+ *            |
+ *          [Res]
+ *
+ *  AFTER
+ *   [IFM1] [IFM2]
+ *        \   |
+ *      [BatchMatMul] [BIAS]
+ *              \      /
+ *               [Add]
+ *                 |
+ *               [Res]
+ *
+ */
+// Graphlet holding FullyConnected whose weights come from Transpose(_y)
+struct FCGraphlet
+{
+public:
+  FCGraphlet() = default;
+  virtual ~FCGraphlet() = default;
+
+  // Builds Transpose(_y) -> FullyConnected(_x, transposed weights, constant bias).
+  // _x and _y are read here, so they must be assigned before this is called.
+  // r_shape: output shape of the FullyConnected
+  // bv     : value every element of the constant bias is set to
+  void init(loco::Graph *g, const ShapeU32 r_shape, const float bv)
+  {
+    _tr_y = g->nodes()->create<luci::CircleTranspose>();
+    _tr_y->a(_y);
+    std::vector<int32_t> tr_val = {1, 0}; // permutation constant for the Transpose
+    _tr_y->perm(create_const_node(g, loco::DataType::S32, {2}, tr_val));
+
+    _fc = g->nodes()->create<luci::CircleFullyConnected>();
+    _fc->input(_x);
+    _fc->weights(_tr_y);
+    _fc->fusedActivationFunction(luci::FusedActFunc::NONE);
+    _fc->dtype(loco::DataType::FLOAT32);
+    _fc->shape(r_shape);
+    // The bias has as many elements as the last dimension of the FullyConnected output
+    auto l = _fc->dim(_fc->rank() - 1).value();
+    std::vector<float> bias_val(l, bv);
+    _fc->bias(create_const_node(g, loco::DataType::FLOAT32, {l}, bias_val));
+    _fc->name("fc");
+  }
+
+public:
+  luci::CircleFullyConnected *fc() { return _fc; }
+
+protected:
+  luci::CircleFullyConnected *_fc = nullptr;
+  luci::CircleTranspose *_tr_y = nullptr;
+  luci::CircleInput *_x = nullptr; // FullyConnected input, assigned by the derived graph
+  luci::CircleInput *_y = nullptr; // Transpose input, assigned by the derived graph
+};
+
+// Complete test graph: two inputs, the FullyConnected graphlet and one output
+struct FCGraph : public TestIsGraphlet<2>, public TestOGraphlet, public FCGraphlet
+{
+  FCGraph() = default;
+  virtual ~FCGraph() = default;
+  // x_shape / y_shape: shapes of the two graph inputs
+  // r_shape          : shape of the output and of the FullyConnected
+  // bv               : value used for every bias element
+  void init(const ShapeU32 x_shape, const ShapeU32 y_shape, const ShapeU32 r_shape, const float bv)
+  {
+    TestIsGraphlet<2>::init(g(), {x_shape, y_shape});
+    TestOGraphlet::init(g(), r_shape);
+    // Bind the inputs before FCGraphlet::init, which reads _x and _y
+    _x = input(0);
+    _y = input(1);
+    FCGraphlet::init(g(), r_shape, bv);
+    output()->from(_fc);
+  }
+};
+
+// Fixture providing an FCGraph and the pass under test
+class ReplaceNonConstFCWithBatchMatMulPassTest : public ::testing::Test
+{
+public:
+  FCGraph g;
+  luci::ReplaceNonConstFCWithBatchMatMulPass pass;
+};
+
+} // namespace
+
+// With an all-zero bias the output must be fed directly by BatchMatMul
+TEST_F(ReplaceNonConstFCWithBatchMatMulPassTest, simple_test)
+{
+  g.init({2, 3}, {2, 3}, {2, 2}, 0.0f);
+
+  const bool changed = pass.run(g.g());
+  EXPECT_TRUE(changed);
+
+  auto batch_matmul = dynamic_cast<luci::CircleBatchMatMul *>(g.output()->from());
+  EXPECT_NE(nullptr, batch_matmul);
+}
+
+// With a non-zero bias the output must be fed by the Add that applies the bias
+TEST_F(ReplaceNonConstFCWithBatchMatMulPassTest, nonzero_bias_test)
+{
+  g.init({2, 3}, {2, 3}, {2, 2}, 1.0f);
+
+  const bool changed = pass.run(g.g());
+  EXPECT_TRUE(changed);
+
+  auto bias_add = dynamic_cast<luci::CircleAdd *>(g.output()->from());
+  EXPECT_NE(nullptr, bias_add);
+}
+
+// A graph without any FullyConnected must be left unchanged.
+// A local graph and a local pass are used here instead of the fixture members.
+TEST_F(ReplaceNonConstFCWithBatchMatMulPassTest, wrong_op_NEG)
+{
+  loco::Graph local_graph;
+
+  auto input_node = local_graph.nodes()->create<luci::CircleInput>();
+  auto relu_node = local_graph.nodes()->create<luci::CircleRelu>();
+  relu_node->features(input_node);
+
+  luci::ReplaceNonConstFCWithBatchMatMulPass local_pass;
+  const bool changed = local_pass.run(&local_graph);
+
+  EXPECT_FALSE(changed);
+}
diff --git a/compiler/luci/pass/src/ResolveCustomOpSplitVPass.cpp b/compiler/luci/pass/src/ResolveCustomOpSplitVPass.cpp
new file mode 100644
index 000000000..a65065800
--- /dev/null
+++ b/compiler/luci/pass/src/ResolveCustomOpSplitVPass.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/ResolveCustomOpSplitVPass.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+#include <luci/Service/Nodes/CircleConst.h>
+
+namespace
+{
+
+// Input node is const S64
+// Return s32 version of node (a clone; the input node itself is not modified)
+// Return nullptr if s64 value is out of range of s32
+luci::CircleConst *s64_to_s32(luci::CircleConst *node)
+{
+  assert(node);
+  assert(node->dtype() == loco::DataType::S64);
+
+  const auto num_elems = node->size<loco::DataType::S64>();
+
+  // Validate every element before cloning, so that a failed conversion does not
+  // leave an unused clone behind in the graph
+  for (uint32_t i = 0; i < num_elems; i++)
+  {
+    const int64_t val = node->at<loco::DataType::S64>(i);
+    if (val < std::numeric_limits<int32_t>::min() or val > std::numeric_limits<int32_t>::max())
+      return nullptr;
+  }
+
+  auto cloned = luci::clone(node);
+  luci::add_origin(cloned, luci::get_origin(node));
+
+  cloned->dtype(loco::DataType::S32);
+  cloned->size<loco::DataType::S32>(num_elems);
+
+  for (uint32_t i = 0; i < num_elems; i++)
+  {
+    // In range by the check above, so the narrowing cast keeps the value
+    cloned->at<loco::DataType::S32>(i) = static_cast<int32_t>(node->at<loco::DataType::S64>(i));
+  }
+
+  return cloned;
+}
+
+/** BEFORE
+ *
+ *        [CircleNode]
+ *              \
+ *               \   [size_splits]  [split_dim]
+ *                \       |             /
+ *               [CircleCustom(SplitV)]
+ *                        |
+ *                 [CircleCustomOut]
+ *                        |
+ *                   [CircleNode]
+ *
+ *  AFTER
+ *
+ *                [CircleNode]
+ *                  |   \
+ *                  |     \   [size_splits]  [split_dim]
+ *                  |      \       |         /
+ *                  |       \      |       /
+ *                  |        \     |      /
+ *    [CircleCustom(SplitV)]  [CircleSplitV]
+ *                  |              |
+ *      [CircleCustomOut]    [CircleSplitVOut]
+ *                                 |
+ *                            [CircleNode]
+ */
+/**
+ * @brief Replace CircleCustom("SplitV") and its CircleCustomOut nodes with
+ *        CircleSplitV / CircleSplitVOut
+ *
+ * @return true if `node` was resolved; false if it is not a 3-input "SplitV" custom op,
+ *         if size_splits / split_dim are not S32 or S64 constants, if size_splits is not
+ *         rank 1, or if an S64 value does not fit into S32
+ */
+bool resolve_splitv(luci::CircleCustom *node)
+{
+  if (node->custom_code() != "SplitV")
+    return false;
+
+  if (node->numInputs() != 3)
+    return false;
+
+  auto size_splits = dynamic_cast<luci::CircleConst *>(node->inputs(1));
+  if (not size_splits)
+    return false;
+
+  auto split_dim = dynamic_cast<luci::CircleConst *>(node->inputs(2));
+  if (not split_dim)
+    return false;
+
+  // Run the checks that create nothing first, so an unsupported node is rejected
+  // before any S32 clone is added to the graph
+  auto s32_or_s64 = [](const luci::CircleConst *c) {
+    return c->dtype() == loco::DataType::S32 or c->dtype() == loco::DataType::S64;
+  };
+  if (not s32_or_s64(size_splits) or not s32_or_s64(split_dim))
+    return false;
+
+  if (size_splits->rank() != 1)
+    return false;
+
+  // Convert size_splits to S32, because luci-interpreter does not support
+  // S64 size_splits yet
+  // TODO Support S64 size_splits
+  if (size_splits->dtype() == loco::DataType::S64)
+  {
+    size_splits = s64_to_s32(size_splits);
+    if (not size_splits)
+      return false;
+  }
+
+  if (split_dim->dtype() == loco::DataType::S64)
+  {
+    split_dim = s64_to_s32(split_dim);
+    if (not split_dim)
+      return false;
+  }
+
+  const auto num_split = size_splits->dim(0).value();
+
+  auto split_v = node->graph()->nodes()->create<luci::CircleSplitV>();
+  split_v->input(node->inputs(0));
+  split_v->size_splits(size_splits);
+  split_v->split_dim(split_dim);
+  split_v->num_split(num_split);
+  split_v->name(node->name());
+  luci::add_origin(split_v, luci::get_origin(node));
+
+  const auto succs = loco::succs(node);
+  for (auto succ : succs)
+  {
+    auto custom_out = loco::must_cast<luci::CircleCustomOut *>(succ); // FIX_CALLER_UNLESS
+
+    // loco::succs() returns a std::set of node pointers, so the iteration order follows
+    // pointer values, not output positions. Take the output index from the CustomOut
+    // being replaced instead of counting iterations.
+    const auto out_index = custom_out->index();
+
+    auto split_v_out = node->graph()->nodes()->create<luci::CircleSplitVOut>();
+    split_v_out->input(split_v);
+    split_v_out->name(node->name() + "_out_" + std::to_string(out_index));
+    split_v_out->index(out_index);
+    luci::add_origin(split_v_out, luci::get_origin(node));
+    loco::replace(custom_out).with(split_v_out);
+  }
+
+  return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+bool ResolveCustomOpSplitVPass::run(loco::Graph *g)
+{
+  bool modified = false;
+
+  // Try to resolve every CircleCustom reachable from the graph outputs
+  for (auto active : loco::active_nodes(loco::output_nodes(g)))
+  {
+    if (auto custom = dynamic_cast<luci::CircleCustom *>(active))
+    {
+      modified = resolve_splitv(custom) || modified;
+    }
+  }
+
+  return modified;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/ResolveCustomOpSplitVPass.test.cpp b/compiler/luci/pass/src/ResolveCustomOpSplitVPass.test.cpp
new file mode 100644
index 000000000..e7738aadb
--- /dev/null
+++ b/compiler/luci/pass/src/ResolveCustomOpSplitVPass.test.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/ResolveCustomOpSplitVPass.h"
+
+#include <luci/test/TestIOGraph.h>
+
+#include <luci/IR/CircleNodes.h>
+#include <gtest/gtest.h>
+
+using namespace luci::test;
+
+namespace
+{
+
+/**
+ *  graph having Custom operator SplitV
+ *
+ *        [Input]  [Const] [Const]
+ *             \    |    /
+ *           [Custom(SplitV)]
+ *             /    |       \
+ *  [CustomOut] [CustomOut] [CustomOut]
+ *       |          |           |
+ *   [Output]   [Output]     [Output]
+ */
+// Test helper that builds a CircleCustom node with custom code "SplitV",
+// its two constant inputs and three CircleCustomOut nodes.
+// Input 0 of the custom node is left unset here; the owner connects it.
+class SplitVGraphlet
+{
+public:
+  SplitVGraphlet() = default;
+
+public:
+  // Create all nodes of the graphlet inside graph `g`
+  void init(loco::Graph *g)
+  {
+    // CircleCustom(SplitV) with 3 inputs and 3 outputs
+    _splitv = g->nodes()->create<luci::CircleCustom>(3, 3);
+    _splitv->custom_code("SplitV");
+    _splitv->shape({1, 2, 2, 192});
+    _splitv->dtype(loco::DataType::FLOAT32);
+    _splitv->name("splitv");
+
+    // CircleConst: size_splits, stored as S64 values {32, 32, 128}
+    auto size_splits = g->nodes()->create<luci::CircleConst>();
+    size_splits->dtype(loco::DataType::S64);
+    size_splits->shape({3});
+    size_splits->size<loco::DataType::S64>(3);
+    size_splits->at<loco::DataType::S64>(0) = 32;
+    size_splits->at<loco::DataType::S64>(1) = 32;
+    size_splits->at<loco::DataType::S64>(2) = 128;
+
+    // CircleConst: split_dim, a rank-0 S32 scalar holding 3
+    auto split_dim = g->nodes()->create<luci::CircleConst>();
+    split_dim->dtype(loco::DataType::S32);
+    split_dim->rank(0);
+    split_dim->size<loco::DataType::S32>(1);
+    split_dim->scalar<loco::DataType::S32>() = 3;
+
+    _splitv->inputs(1, size_splits);
+    _splitv->inputs(2, split_dim);
+
+    // CircleCustomOut for output index 0
+    _splitv_out1 = g->nodes()->create<luci::CircleCustomOut>();
+    _splitv_out1->shape({1, 2, 2, 32});
+    _splitv_out1->dtype(loco::DataType::FLOAT32);
+    _splitv_out1->index(0);
+    _splitv_out1->input(_splitv);
+
+    // CircleCustomOut for output index 1
+    _splitv_out2 = g->nodes()->create<luci::CircleCustomOut>();
+    _splitv_out2->shape({1, 2, 2, 32});
+    _splitv_out2->dtype(loco::DataType::FLOAT32);
+    _splitv_out2->index(1);
+    _splitv_out2->input(_splitv);
+
+    // CircleCustomOut for output index 2
+    _splitv_out3 = g->nodes()->create<luci::CircleCustomOut>();
+    _splitv_out3->shape({1, 2, 2, 128});
+    _splitv_out3->dtype(loco::DataType::FLOAT32);
+    _splitv_out3->index(2);
+    _splitv_out3->input(_splitv);
+  }
+
+public:
+  // Accessor for the custom node, e.g. to change its custom code in a test
+  luci::CircleCustom *splitv() { return _splitv; }
+
+protected:
+  // All pointers stay nullptr until init() is called
+  luci::CircleCustom *_splitv = nullptr;
+  luci::CircleCustomOut *_splitv_out1 = nullptr;
+  luci::CircleCustomOut *_splitv_out2 = nullptr;
+  luci::CircleCustomOut *_splitv_out3 = nullptr;
+};
+
+// Complete test graph: one input, the Custom(SplitV) graphlet and three outputs
+class SplitVGraph : public TestIGraphlet, public TestOsGraphlet<3>, public SplitVGraphlet
+{
+public:
+  SplitVGraph() = default;
+
+  void init(void)
+  {
+    TestIGraphlet::init(g(), {1, 2, 2, 192});
+    TestOsGraphlet<3>::init(g(), {{1, 2, 2, 32}, {1, 2, 2, 32}, {1, 2, 2, 128}});
+    SplitVGraphlet::init(g());
+
+    // feed the graph input into the custom node
+    _splitv->inputs(0, input());
+
+    // route each CustomOut to the graph output at the same position
+    luci::CircleCustomOut *const custom_outs[] = {_splitv_out1, _splitv_out2, _splitv_out3};
+    int out_idx = 0;
+    for (auto *custom_out : custom_outs)
+      output(out_idx++)->from(custom_out);
+  }
+};
+
+// Fixture holding the test graph and the pass under test
+struct SplitVGraphTest : ::testing::Test
+{
+  SplitVGraph g;
+  luci::ResolveCustomOpSplitVPass pass;
+};
+
+} // namespace
+
+// Running the pass on a Custom(SplitV) graph must produce CircleSplitVOut
+// nodes fed by a CircleSplitV whose constant inputs are S32.
+TEST_F(SplitVGraphTest, simple_test)
+{
+  g.init();
+
+  auto ret = pass.run(g.g());
+  EXPECT_EQ(true, ret);
+
+  // ASSERT (fatal) rather than EXPECT on every pointer that is dereferenced
+  // further down, so a failed cast ends the test instead of crashing it.
+  auto svo_1 = dynamic_cast<luci::CircleSplitVOut *>(g.output(0)->from());
+  ASSERT_NE(nullptr, svo_1);
+  auto svo_2 = dynamic_cast<luci::CircleSplitVOut *>(g.output(1)->from());
+  ASSERT_NE(nullptr, svo_2);
+  auto svo_3 = dynamic_cast<luci::CircleSplitVOut *>(g.output(2)->from());
+  ASSERT_NE(nullptr, svo_3);
+
+  auto sv = dynamic_cast<luci::CircleSplitV *>(svo_1->input());
+  ASSERT_NE(nullptr, sv);
+  sv = dynamic_cast<luci::CircleSplitV *>(svo_2->input());
+  ASSERT_NE(nullptr, sv);
+  sv = dynamic_cast<luci::CircleSplitV *>(svo_3->input());
+  ASSERT_NE(nullptr, sv);
+
+  // size_splits is expected as S32 with the values the graphlet stored
+  auto size_splits = loco::must_cast<luci::CircleConst *>(sv->size_splits());
+  EXPECT_EQ(loco::DataType::S32, size_splits->dtype());
+  EXPECT_EQ(32, size_splits->at<loco::DataType::S32>(0));
+  EXPECT_EQ(32, size_splits->at<loco::DataType::S32>(1));
+  EXPECT_EQ(128, size_splits->at<loco::DataType::S32>(2));
+
+  auto split_dim = loco::must_cast<luci::CircleConst *>(sv->split_dim());
+  EXPECT_EQ(loco::DataType::S32, split_dim->dtype());
+  EXPECT_EQ(3, split_dim->scalar<loco::DataType::S32>());
+}
+
+// Negative case: the pass must report no change when the custom code is not "SplitV"
+TEST_F(SplitVGraphTest, wrong_op_NEG)
+{
+  g.init();
+  g.splitv()->custom_code("AddV2");
+
+  const bool changed = pass.run(g.g());
+
+  EXPECT_FALSE(changed);
+}
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeGranularity.h b/compiler/luci/pass/src/VerifyQuantizedNodeGranularity.h
index 442183c18..408e6b8d9 100644
--- a/compiler/luci/pass/src/VerifyQuantizedNodeGranularity.h
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeGranularity.h
@@ -197,6 +197,13 @@ private:
     return true;
   }
 
+  // ReduceMax passes when both the node and its input satisfy is_lwq
+  bool visit(const luci::CircleReduceMax *node)
+  {
+    RETURN_FALSE_UNLESS(is_lwq(node));
+    RETURN_FALSE_UNLESS(is_lwq(node->input()));
+    return true;
+  }
+
   bool visit(const luci::CircleRelu *node)
   {
     RETURN_FALSE_UNLESS(is_lwq(node));
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeType.cpp b/compiler/luci/pass/src/VerifyQuantizedNodeType.cpp
index 4e1c062c0..cf86acabe 100644
--- a/compiler/luci/pass/src/VerifyQuantizedNodeType.cpp
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeType.cpp
@@ -302,6 +302,15 @@ bool VerifyQuantizedNodeTypeBase<Qtype, Btype>::visit(const luci::CirclePow *nod
 }
 
 template <loco::DataType Qtype, loco::DataType Btype>
+// ReduceMax: node and input must have type Qtype, reduction_indices must be S32
+bool VerifyQuantizedNodeTypeBase<Qtype, Btype>::visit(const luci::CircleReduceMax *node)
+{
+  RETURN_FALSE_UNLESS(has_type(node, Qtype))
+  RETURN_FALSE_UNLESS(has_type(node->input(), Qtype))
+  RETURN_FALSE_UNLESS(has_type(node->reduction_indices(), loco::DataType::S32))
+  return true;
+}
+
+template <loco::DataType Qtype, loco::DataType Btype>
 bool VerifyQuantizedNodeTypeBase<Qtype, Btype>::visit(const luci::CircleRelu *node)
 {
   return group_has_type(node, Qtype);
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeType.h b/compiler/luci/pass/src/VerifyQuantizedNodeType.h
index ff1acbd6f..789d3c7cd 100644
--- a/compiler/luci/pass/src/VerifyQuantizedNodeType.h
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeType.h
@@ -104,6 +104,7 @@ private:
   bool visit(const luci::CirclePadV2 *node);
   bool visit(const luci::CirclePRelu *node);
   bool visit(const luci::CirclePow *node);
+  bool visit(const luci::CircleReduceMax *node);
   bool visit(const luci::CircleRelu *node);
   bool visit(const luci::CircleReshape *node);
   bool visit(const luci::CircleResizeBilinear *node);
diff --git a/compiler/luci/pass/src/helpers/SparsityFormatConverter.cpp b/compiler/luci/pass/src/helpers/SparsityFormatConverter.cpp
new file mode 100644
index 000000000..72b7d60ff
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/SparsityFormatConverter.cpp
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// codes under namespace sparsity referenced from
+// https://github.com/tensorflow/tensorflow/blob/3f878cff5b698b82eea85db2b60d65a2e320850e/
+//       tensorflow/lite/kernels/internal/utils/sparsity_format_converter.h
+//       tensorflow/lite/kernels/internal/utils/sparsity_format_converter.cc
+
+#include "SparsityFormatConverter.h"
+
+#include <oops/InternalExn.h>
+
+#include <cassert>
+
+namespace sparsity
+{
+
+namespace
+{
+
+/**
+ * @brief Convert a multi-dimensional index into a flat offset, treating the
+ *        last dimension of `shape` as the fastest-varying one
+ * @param indices  one index per dimension; must have at least shape.size() entries
+ * @param shape    extent of each dimension
+ * @return sum over i of indices[i] * product(shape[i+1 .. end])
+ *
+ * All arithmetic is done in uint64_t so that strides and offsets of tensors
+ * with more than INT_MAX elements do not overflow.
+ */
+uint64_t GetFlattenedIndex(const std::vector<int> &indices, const std::vector<int> &shape)
+{
+  assert(indices.size() >= shape.size());
+
+  uint64_t index = 0;
+  uint64_t sub_elements = 1; // stride of the dimension currently visited
+  for (size_t i = shape.size(); i-- > 0;)
+  {
+    index += static_cast<uint64_t>(indices[i]) * sub_elements;
+    sub_elements *= static_cast<uint64_t>(shape[i]);
+  }
+  return index;
+}
+
+/**
+ * @brief Copy the contents of a TfLiteIntArray into a std::vector<int>
+ * @param int_array  source array, may be nullptr
+ * @return the copied values; empty when int_array is nullptr
+ */
+std::vector<int> TfLiteIntArrayToVector(const TfLiteIntArray *int_array)
+{
+  std::vector<int> values;
+
+  if (int_array != nullptr)
+  {
+    values.resize(int_array->size);
+
+    int idx = 0;
+    while (idx < int_array->size)
+    {
+      values[idx] = int_array->data[idx];
+      ++idx;
+    }
+  }
+
+  return values;
+}
+
+} // namespace
+
+/**
+ * @brief Build a sparse-to-dense converter from a TfLiteSparsity description
+ * @param shape     shape of the dense tensor to produce
+ * @param sparsity  sparsity parameters; its arrays are copied into vectors
+ */
+template <typename T>
+FormatConverter<T>::FormatConverter(const std::vector<int> &shape, const TfLiteSparsity &sparsity)
+{
+  const int dim_count = sparsity.dim_metadata_size;
+
+  // unpack the per-dimension metadata into parallel vectors
+  std::vector<TfLiteDimensionType> format(dim_count);
+  std::vector<int> dense_size(dim_count);
+  std::vector<std::vector<int>> segments(dim_count);
+  std::vector<std::vector<int>> indices(dim_count);
+  for (int d = 0; d < dim_count; d++)
+  {
+    const TfLiteDimensionMetadata &meta = sparsity.dim_metadata[d];
+    format[d] = meta.format;
+    dense_size[d] = meta.dense_size;
+    segments[d] = TfLiteIntArrayToVector(meta.array_segments);
+    indices[d] = TfLiteIntArrayToVector(meta.array_indices);
+  }
+
+  InitSparseToDenseConverter(shape, TfLiteIntArrayToVector(sparsity.traversal_order),
+                             std::move(format), std::move(dense_size), std::move(segments),
+                             std::move(indices), TfLiteIntArrayToVector(sparsity.block_map));
+}
+
+// Store the converter state needed by SparseToDense:
+//  - dense_shape_, traversal_order_, block_map_, format_ are taken over as given
+//  - dense_size_ becomes the product of all dense_shape_ entries
+//  - dim_metadata_ holds two slots per dimension (see below)
+//  - block_size_ / blocked_shape_ are derived from block_map_ and dense_size
+template <typename T>
+void FormatConverter<T>::InitSparseToDenseConverter(
+  std::vector<int> shape, std::vector<int> traversal_order, std::vector<TfLiteDimensionType> format,
+  std::vector<int> dense_size, std::vector<std::vector<int>> segments,
+  std::vector<std::vector<int>> indices, std::vector<int> block_map)
+{
+  dense_shape_ = std::move(shape);
+  traversal_order_ = std::move(traversal_order);
+  block_map_ = std::move(block_map);
+  format_ = std::move(format);
+
+  // total number of elements of the dense tensor
+  dense_size_ = 1;
+  for (size_t i = 0; i < dense_shape_.size(); i++)
+  {
+    dense_size_ *= dense_shape_[i];
+  }
+
+  // slot 2*i    : {dense_size} for a dense dimension, segments for a sparse one
+  // slot 2*i + 1: indices for a sparse dimension, left empty for a dense one
+  dim_metadata_.resize(2 * format_.size());
+  for (size_t i = 0; i < format_.size(); i++)
+  {
+    if (format_[i] == kTfLiteDimDense)
+    {
+      dim_metadata_[2 * i] = {dense_size[i]};
+    }
+    else
+    {
+      dim_metadata_[2 * i] = std::move(segments[i]);
+      dim_metadata_[2 * i + 1] = std::move(indices[i]);
+    }
+  }
+
+  int original_rank = dense_shape_.size();
+  int block_dim = 0;
+
+  blocked_shape_.resize(original_rank);
+  block_size_.resize(block_map_.size());
+  for (int i = 0; i < original_rank; i++)
+  {
+    // dimension i is blocked when it is the next entry of block_map_
+    if (block_dim < (int)block_map_.size() && block_map_[block_dim] == i)
+    {
+      // the block's traversal entry sits after the original_rank leading entries;
+      // when traversal_order_ is too short the dimension is skipped and
+      // blocked_shape_[i] keeps the value resize() gave it
+      if (original_rank + block_dim < (int)traversal_order_.size())
+      {
+        int orig_dim = traversal_order_[original_rank + block_dim];
+        block_size_[block_dim] = dense_size[orig_dim];
+        blocked_shape_[i] = dense_shape_[i] / dense_size[orig_dim];
+        block_dim++;
+      }
+    }
+    else
+    {
+      blocked_shape_[i] = dense_shape_[i];
+    }
+  }
+}
+
+// Recursively walk the traversal levels and scatter the sparse values into the
+// dense buffer.
+//   src_data     : sparse value buffer, read sequentially through *src_data_ptr
+//   indices      : index chosen at each level so far (passed by value)
+//   level        : level currently being expanded
+//   prev_idx     : position reached on the previous level, used to look up
+//                  the segment range of a sparse level
+//   src_data_ptr : running read position in src_data, advanced per written value
+//   dest_data    : dense output buffer
+template <typename T>
+void FormatConverter<T>::Populate(const T *src_data, std::vector<int> indices, int level,
+                                  int prev_idx, int *src_data_ptr, T *dest_data)
+{
+  // every level has an index: translate to a dense position and store one value
+  if (static_cast<size_t>(level) == indices.size())
+  {
+    int orig_rank = dense_shape_.size();
+    std::vector<int> orig_idx;
+    orig_idx.resize(orig_rank);
+    int i = 0;
+    // the first orig_rank levels map directly onto original dimensions
+    for (; static_cast<size_t>(i) < orig_idx.size(); i++)
+    {
+      int orig_dim = traversal_order_[i];
+      orig_idx[orig_dim] = indices[i];
+    }
+
+    // remaining levels are block dimensions: combine them with the outer index
+    for (; static_cast<size_t>(i) < indices.size(); i++)
+    {
+      const int block_idx = traversal_order_[i] - orig_rank;
+      const int orig_dim = block_map_[block_idx];
+      orig_idx[orig_dim] = orig_idx[orig_dim] * block_size_[block_idx] + indices[i];
+    }
+
+    dest_data[GetFlattenedIndex(orig_idx, dense_shape_)] = src_data[*src_data_ptr];
+
+    *src_data_ptr = *src_data_ptr + 1;
+    return;
+  }
+
+  const int metadata_idx = 2 * level;
+  const int shape_of_level = dim_metadata_[metadata_idx][0];
+  if (format_[level] == kTfLiteDimDense)
+  {
+    // dense level: visit every index in [0, shape_of_level)
+    for (int i = 0; i < shape_of_level; i++)
+    {
+      indices[level] = i;
+      Populate(src_data, indices, level + 1, prev_idx * shape_of_level + i, src_data_ptr,
+               dest_data);
+    }
+  }
+  else if (static_cast<size_t>(prev_idx + 1) < dim_metadata_[metadata_idx].size())
+  {
+    // sparse level: visit only the entries listed between the two segment bounds
+    const auto &array_segments = dim_metadata_[metadata_idx];
+    const auto &array_indices = dim_metadata_[metadata_idx + 1];
+    for (int i = array_segments[prev_idx]; i < array_segments[prev_idx + 1]; i++)
+    {
+      // entries outside array_indices are skipped instead of read out of range
+      if (static_cast<size_t>(i) < array_indices.size() &&
+          static_cast<size_t>(level) < indices.size())
+      {
+        indices[level] = array_indices[i];
+        Populate(src_data, indices, level + 1, i, src_data_ptr, dest_data);
+      }
+    }
+  }
+}
+
+/**
+ * @brief Expand sparse values into the internal dense buffer
+ * @param src_data  sparse value buffer, consumed in traversal order by Populate
+ * @return always true
+ *
+ * The result is available through GetData() afterwards. Positions that
+ * Populate does not write stay T(0).
+ */
+template <typename T> bool FormatConverter<T>::SparseToDense(const T *src_data)
+{
+  // size the buffer and zero every element in one call; this needs nothing
+  // beyond <vector>
+  data_.assign(dense_size_, T(0));
+
+  const int total_rank = traversal_order_.size();
+  int src_data_ptr = 0;
+  std::vector<int> indices(total_rank);
+  Populate(src_data, indices, 0, 0, &src_data_ptr, data_.data());
+
+  return true;
+}
+
+template class FormatConverter<float>;
+template class FormatConverter<uint16_t>;
+
+} // namespace sparsity
+
+#include <luci/IR/SparsityParam.h>
+
+namespace luci
+{
+
+// Map luci::DimensionType to sparsity::TfLiteDimensionType.
+// SPARSE_CSR maps to kTfLiteDimSparseCSR; every other value maps to kTfLiteDimDense.
+sparsity::TfLiteDimensionType to_tflite_sparsity(luci::DimensionType dt)
+{
+  if (dt == luci::DimensionType::SPARSE_CSR)
+    return sparsity::TfLiteDimensionType::kTfLiteDimSparseCSR;
+
+  return sparsity::TfLiteDimensionType::kTfLiteDimDense;
+}
+
+// Convert a luci::SparseIndexVector into a newly allocated TfLiteIntArray.
+// NONE yields an array of size 0; I32/U16/U8 vectors are copied element-wise.
+// Any other type is reported through INTERNAL_EXN_V.
+sparsity::TfLiteIntArray *to_tflite_sparsity(const luci::SparseIndexVector &data)
+{
+  const auto type = data.type();
+
+  if (type == luci::SparseIndexVectorType::NONE)
+  {
+    const std::vector<int32_t> empty;
+    return makeTfLiteArray(empty);
+  }
+  if (type == luci::SparseIndexVectorType::I32)
+    return makeTfLiteArray<int32_t>(*data.as_int32_vector());
+  if (type == luci::SparseIndexVectorType::U16)
+    return makeTfLiteArray<uint16_t>(*data.as_uint16_vector());
+  if (type == luci::SparseIndexVectorType::U8)
+    return makeTfLiteArray<uint8_t>(*data.as_uint8_vector());
+
+  INTERNAL_EXN_V("unsupported SparseIndexVectorType", oops::to_uint32(type));
+}
+
+// Build a TfLiteSparsity whose arrays are fresh heap copies of the fields of `sp`.
+// The arrays are allocated with new[] by the make* helpers, so the caller has
+// to release them.
+sparsity::TfLiteSparsity to_tflite_sparsity(const luci::SparsityParam *sp)
+{
+  const auto &dim_metadata = sp->dim_metadata;
+
+  sparsity::TfLiteSparsity result;
+  result.traversal_order = makeTfLiteArray(sp->traversal_order);
+  result.block_map = makeTfLiteArray(sp->block_map);
+  result.dim_metadata = makeTfLiteDimensionMetadata(dim_metadata);
+  result.dim_metadata_size = dim_metadata.size();
+  return result;
+}
+
+// Allocate a TfLiteIntArray holding a copy of `data`.
+// The storage is one int[] block: the first int is the size field, the
+// remaining data.size() ints are the payload.
+template <typename T> sparsity::TfLiteIntArray *makeTfLiteArray(const std::vector<T> &data)
+{
+  const size_t count = data.size();
+
+  int *storage = new int[count + 1];
+  auto *array = reinterpret_cast<sparsity::TfLiteIntArray *>(storage);
+  array->size = count;
+
+  size_t idx = 0;
+  for (const auto &value : data)
+  {
+    array->data[idx++] = value;
+  }
+  return array;
+}
+
+// Allocate an array with one TfLiteDimensionMetadata per entry of `data`.
+// Each entry's segment and index arrays are separately allocated copies.
+sparsity::TfLiteDimensionMetadata *
+makeTfLiteDimensionMetadata(const std::vector<luci::DimMetaData> &data)
+{
+  const size_t count = data.size();
+  auto *metadata = new sparsity::TfLiteDimensionMetadata[count];
+
+  for (size_t idx = 0; idx < count; ++idx)
+  {
+    const auto &src = data[idx];
+    auto &dst = metadata[idx];
+
+    dst.format = to_tflite_sparsity(src.format());
+    dst.dense_size = src.dense_size();
+    dst.array_segments = to_tflite_sparsity(src.array_segments());
+    dst.array_indices = to_tflite_sparsity(src.array_indices());
+  }
+
+  return metadata;
+}
+
+/**
+ * @brief Release every heap array referenced by `tflsp`
+ * @param tflsp  sparsity description whose arrays were allocated by
+ *               makeTfLiteArray / makeTfLiteDimensionMetadata
+ *
+ * Frees traversal_order, block_map, each dimension's segment and index arrays
+ * and the dim_metadata array itself. All released pointers are reset to
+ * nullptr and dim_metadata_size to 0 so the struct does not keep dangling
+ * pointers.
+ *
+ * NOTE(review): assumes every TfLiteIntArray here was allocated as int[] the
+ * way makeTfLiteArray does; confirm no caller builds them differently.
+ */
+void freeTfLiteSparsity(sparsity::TfLiteSparsity &tflsp)
+{
+  // makeTfLiteArray allocates with `new int[]`, so delete[] must see an int*
+  auto release = [](sparsity::TfLiteIntArray *&array) {
+    delete[] reinterpret_cast<int *>(array);
+    array = nullptr;
+  };
+
+  assert(tflsp.traversal_order);
+  assert(tflsp.block_map);
+  release(tflsp.traversal_order);
+  release(tflsp.block_map);
+
+  for (int i = 0; i < tflsp.dim_metadata_size; ++i)
+  {
+    assert(tflsp.dim_metadata[i].array_segments);
+    assert(tflsp.dim_metadata[i].array_indices);
+    release(tflsp.dim_metadata[i].array_segments);
+    release(tflsp.dim_metadata[i].array_indices);
+  }
+
+  // the metadata array is a separate new[] allocation and must be freed too
+  delete[] tflsp.dim_metadata;
+  tflsp.dim_metadata = nullptr;
+  tflsp.dim_metadata_size = 0;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/helpers/SparsityFormatConverter.h b/compiler/luci/pass/src/helpers/SparsityFormatConverter.h
new file mode 100644
index 000000000..fcd9bbcd0
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/SparsityFormatConverter.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PASS_HELPERS_SPARSITY_FORMAT_CONVERTER_H__
+#define __LUCI_PASS_HELPERS_SPARSITY_FORMAT_CONVERTER_H__
+
+#include <cstdint>
+#include <vector>
+
+// codes under namespace sparsity referenced from
+// https://github.com/tensorflow/tensorflow/blob/3f878cff5b698b82eea85db2b60d65a2e320850e/
+//       tensorflow/lite/kernels/internal/utils/sparsity_format_converter.h
+//       tensorflow/lite/kernels/internal/utils/sparsity_format_converter.cc
+
+namespace sparsity
+{
+
+// How a single dimension of a sparse tensor is stored.
+enum TfLiteDimensionType
+{
+  kTfLiteDimDense = 0,
+  kTfLiteDimSparseCSR = 1,
+};
+
+// Length-prefixed list of integers: `size` is followed directly by `size`
+// values in `data`. Used for dimensions and inputs/outputs tensor indices.
+struct TfLiteIntArray
+{
+  int size;
+  int data[];
+};
+
+// Per-dimension encoding information of a sparse tensor.
+struct TfLiteDimensionMetadata
+{
+  TfLiteDimensionType format;
+  int dense_size;
+  TfLiteIntArray *array_segments;
+  TfLiteIntArray *array_indices;
+};
+
+// Full set of parameters describing how a sparse tensor is encoded.
+// See lite/schema/schema.fbs for a detailed explanation of each field.
+struct TfLiteSparsity
+{
+  TfLiteIntArray *traversal_order;
+  TfLiteIntArray *block_map;
+  TfLiteDimensionMetadata *dim_metadata;
+  int dim_metadata_size;
+};
+
+// A converter that keeps an internal representation of sparse tensor parameters
+// and expands sparse tensor data into dense form.
+template <typename T> class FormatConverter
+{
+public:
+  /* Creates a sparse to dense converter.
+   * @param shape      Shape of the target dense tensor.
+   * @param sparsity   Sparsity parameter of the sparse TfLiteTensor.
+   */
+  FormatConverter(const std::vector<int> &shape, const TfLiteSparsity &sparsity);
+
+  // Dense data produced by the last SparseToDense() call.
+  // Const-qualified so it can also be read through a const converter.
+  const std::vector<T> &GetData() const { return data_; }
+  // Per-dimension metadata, two slots per dimension.
+  const std::vector<std::vector<int>> &GetDimMetadata() const { return dim_metadata_; }
+
+  // Expands `src_data` into the internal dense buffer; see GetData().
+  bool SparseToDense(const T *src_data);
+
+private:
+  // Helper function for initializing this converter for sparse to dense
+  // conversion.
+  void InitSparseToDenseConverter(std::vector<int> shape, std::vector<int> traversal_order,
+                                  std::vector<TfLiteDimensionType> format,
+                                  std::vector<int> dense_size,
+                                  std::vector<std::vector<int>> segments,
+                                  std::vector<std::vector<int>> indices,
+                                  std::vector<int> block_map);
+
+  // Recursive worker of SparseToDense: walks the traversal levels and writes
+  // values from `src_data` into `dest_data`.
+  void Populate(const T *src_data, std::vector<int> indices, int level, int prev_idx,
+                int *src_data_ptr, T *dest_data);
+
+private:
+  std::vector<int> dense_shape_;
+  std::vector<int> blocked_shape_;
+  size_t dense_size_;
+  std::vector<int> traversal_order_;
+  std::vector<TfLiteDimensionType> format_;
+  std::vector<int> block_size_;
+  std::vector<int> block_map_;
+  std::vector<std::vector<int>> dim_metadata_;
+  std::vector<T> data_;
+};
+
+extern template class FormatConverter<float>;
+extern template class FormatConverter<uint16_t>;
+
+} // namespace sparsity
+
+#include <luci/IR/SparsityParam.h>
+
+namespace luci
+{
+
+sparsity::TfLiteDimensionType to_tflite_sparsity(luci::DimensionType dt);
+sparsity::TfLiteIntArray *to_tflite_sparsity(const luci::SparseIndexVector &data);
+sparsity::TfLiteSparsity to_tflite_sparsity(const luci::SparsityParam *sp);
+
+template <typename T> sparsity::TfLiteIntArray *makeTfLiteArray(const std::vector<T> &data);
+sparsity::TfLiteDimensionMetadata *
+makeTfLiteDimensionMetadata(const std::vector<luci::DimMetaData> &data);
+
+void freeTfLiteSparsity(sparsity::TfLiteSparsity &tflsp);
+
+} // namespace luci
+
+#endif // __LUCI_PASS_HELPERS_SPARSITY_FORMAT_CONVERTER_H__
diff --git a/compiler/luci/requires.cmake b/compiler/luci/requires.cmake
index e896188be..0a5e6a58b 100644
--- a/compiler/luci/requires.cmake
+++ b/compiler/luci/requires.cmake
@@ -10,4 +10,5 @@ require("oops")
 require("hermes")
 require("hermes-std")
 require("tflchef")
+require("circlechef")
 require("tflite2circle")
diff --git a/compiler/luci/service/src/CircleCloneNode.h b/compiler/luci/service/src/CircleCloneNode.h
index 99e4561b3..95f06db4c 100644
--- a/compiler/luci/service/src/CircleCloneNode.h
+++ b/compiler/luci/service/src/CircleCloneNode.h
@@ -72,6 +72,7 @@ public:
   CloneNodeLet(loco::Graph *graph) : _graph(graph){};
 
 public:
+  luci::CircleNode *visit(const luci::CircleDensify *) final;
   luci::CircleNode *visit(const luci::CircleDepthToSpace *) final;
   luci::CircleNode *visit(const luci::CircleDepthwiseConv2D *) final;
   luci::CircleNode *visit(const luci::CircleDequantize *) final;
diff --git a/compiler/luci/service/src/CircleShapeInferenceRule.cpp b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
index 9d156f3e2..a368faef4 100644
--- a/compiler/luci/service/src/CircleShapeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
@@ -204,6 +204,7 @@ template <class CIRCLENODE> loco::NodeShape broadcast_xy(const CIRCLENODE *node)
     return loco::NodeShape{inputs_shape};                                               \
   }
 
+DECLARE_USE_SINGLE(input);
 DECLARE_USE_SINGLE(inputs);
 DECLARE_USE_SINGLE(x);
 DECLARE_USE_SINGLE(logits);
@@ -258,10 +259,10 @@ loco::NodeShape infer_add_n(const luci::CircleAddN *node)
   return loco::NodeShape{shape};
 }
 
-loco::NodeShape infer_arg_max(const luci::CircleArgMax *node)
+template <class CIRCLENODE> loco::NodeShape infer_arg_maxmin(const CIRCLENODE *node)
 {
-  auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
-  auto dimension_shape = luci::shape_get(node->dimension()).as<loco::TensorShape>();
+  auto input_shape = luci::shape_get(node->input()).template as<loco::TensorShape>();
+  auto dimension_shape = luci::shape_get(node->dimension()).template as<loco::TensorShape>();
 
   int64_t select_axis = 0;
   {
@@ -271,55 +272,19 @@ loco::NodeShape infer_arg_max(const luci::CircleArgMax *node)
     // Support S32 for now.
     auto const_shape_node = loco::must_cast<luci::CircleConst *>(node->dimension());
     LUCI_ASSERT(const_shape_node->dtype() == loco::DataType::S32,
-                "Only support int32 CircleConst for CircleArgMax");
+                "Only support int32 CircleConst for CircleArgMax/CircleArgMin");
 
     if (const_shape_node->rank() > 1)
       INTERNAL_EXN_V("Only support rank 0/1 CircleConst",
                      oops::to_uint32(const_shape_node->rank()));
 
-    select_axis = const_shape_node->scalar<loco::DataType::S32>();
-  }
-  assert(select_axis < input_shape.rank());
-  assert(select_axis >= 0); // TODO support minus of this breaks
-
-  // NOTE select_axis is removed
-  loco::TensorShape shape_output;
-  uint32_t rank = input_shape.rank();
-  uint32_t shrink = static_cast<uint32_t>(select_axis);
-  assert(rank > 0);
-  shape_output.rank(rank - 1);
-  for (uint32_t r = 0, d = 0; r < rank; ++r)
-  {
-    if (r == shrink)
-      continue;
-    shape_output.dim(d++) = input_shape.dim(r);
+    select_axis = const_shape_node->template scalar<loco::DataType::S32>();
   }
-  return loco::NodeShape{shape_output};
-}
-
-loco::NodeShape infer_arg_min(const luci::CircleArgMin *node)
-{
-  auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
-  auto dimension_shape = luci::shape_get(node->dimension()).as<loco::TensorShape>();
-
-  int64_t select_axis = 0;
-  {
-    LUCI_ASSERT(node->dimension(), "2nd input dimension() should not be nullptr");
-
-    // Only support node's shape() is CircleConst with S32/S64
-    // Support S32 for now.
-    auto const_shape_node = loco::must_cast<luci::CircleConst *>(node->dimension());
-    LUCI_ASSERT(const_shape_node->dtype() == loco::DataType::S32,
-                "Only support int32 CircleConst for CircleArgMin");
-
-    if (const_shape_node->rank() > 1)
-      INTERNAL_EXN_V("Only support rank 0/1 CircleConst",
-                     oops::to_uint32(const_shape_node->rank()));
 
-    select_axis = const_shape_node->scalar<loco::DataType::S32>();
-  }
   assert(select_axis < input_shape.rank());
-  assert(select_axis >= 0); // TODO support minus of this breaks
+
+  if (select_axis < 0)
+    select_axis += input_shape.rank();
 
   // NOTE select_axis is removed
   loco::TensorShape shape_output;
@@ -1180,45 +1145,17 @@ loco::NodeShape infer_reshape(const luci::CircleReshape *node)
   return loco::NodeShape{output_shape};
 }
 
-loco::NodeShape infer_resize_bilinear(const luci::CircleResizeBilinear *node)
+template <class CIRCLENODE> loco::NodeShape infer_resize_type(const CIRCLENODE *node)
 {
-  auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
-
-  if (input_shape.rank() != 4)
-    INTERNAL_EXN("Expected ResizeBilinear input to have rank 4");
-
-  auto *const_node = loco::must_cast<luci::CircleConst *>(node->size());
-
-  if (const_node->dtype() != loco::DataType::S32)
-    INTERNAL_EXN("Only S32 datatype is supported for ResizeBilinear size");
-
-  if (const_node->rank() != 1)
-    INTERNAL_EXN("Expected size tensor of rank 1");
-
-  if (const_node->dim(0).value() != 2)
-    INTERNAL_EXN("Expected size tensor with shape [2]");
-
-  loco::TensorShape output_shape;
-  output_shape.rank(4);
-  output_shape.dim(0) = input_shape.dim(0);
-  output_shape.dim(1) = const_node->at<loco::DataType::S32>(0);
-  output_shape.dim(2) = const_node->at<loco::DataType::S32>(1);
-  output_shape.dim(3) = input_shape.dim(3);
-
-  return loco::NodeShape{output_shape};
-}
-
-loco::NodeShape infer_resize_nearest_neighbor(const luci::CircleResizeNearestNeighbor *node)
-{
-  auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
+  auto input_shape = luci::shape_get(node->input()).template as<loco::TensorShape>();
 
   if (input_shape.rank() != 4)
-    INTERNAL_EXN("Expected ResizeNearesNeighbor input to have rank 4");
+    INTERNAL_EXN("Expected input to have rank 4");
 
   auto *const_node = loco::must_cast<luci::CircleConst *>(node->size());
 
   if (const_node->dtype() != loco::DataType::S32)
-    INTERNAL_EXN("Only S32 datatype is supported for ResizeNearesNeighbor size");
+    INTERNAL_EXN("Only S32 datatype is supported for size");
 
   if (const_node->rank() != 1)
     INTERNAL_EXN("Expected size tensor of rank 1");
@@ -1229,8 +1166,8 @@ loco::NodeShape infer_resize_nearest_neighbor(const luci::CircleResizeNearestNei
   loco::TensorShape output_shape;
   output_shape.rank(4);
   output_shape.dim(0) = input_shape.dim(0);
-  output_shape.dim(1) = const_node->at<loco::DataType::S32>(0);
-  output_shape.dim(2) = const_node->at<loco::DataType::S32>(1);
+  output_shape.dim(1) = const_node->template at<loco::DataType::S32>(0);
+  output_shape.dim(2) = const_node->template at<loco::DataType::S32>(1);
   output_shape.dim(3) = input_shape.dim(3);
 
   return loco::NodeShape{output_shape};
@@ -2080,9 +2017,9 @@ public:
 
   loco::NodeShape visit(const luci::CircleAddN *node) final { return infer_add_n(node); }
 
-  loco::NodeShape visit(const luci::CircleArgMax *node) final { return infer_arg_max(node); }
+  loco::NodeShape visit(const luci::CircleArgMax *node) final { return infer_arg_maxmin(node); }
 
-  loco::NodeShape visit(const luci::CircleArgMin *node) final { return infer_arg_min(node); }
+  loco::NodeShape visit(const luci::CircleArgMin *node) final { return infer_arg_maxmin(node); }
 
   loco::NodeShape visit(const luci::CircleAveragePool2D *node) final
   {
@@ -2119,6 +2056,8 @@ public:
 
   loco::NodeShape visit(const luci::CircleCustom *node) final { return use_own(node); }
 
+  loco::NodeShape visit(const luci::CircleDensify *node) final { return use_input(node); }
+
   loco::NodeShape visit(const luci::CircleDepthToSpace *node) final
   {
     return infer_depth_to_space(node);
@@ -2348,12 +2287,12 @@ public:
 
   loco::NodeShape visit(const luci::CircleResizeBilinear *node) final
   {
-    return infer_resize_bilinear(node);
+    return infer_resize_type(node);
   }
 
   loco::NodeShape visit(const luci::CircleResizeNearestNeighbor *node) final
   {
-    return infer_resize_nearest_neighbor(node);
+    return infer_resize_type(node);
   }
 
   loco::NodeShape visit(const luci::CircleReverseSequence *node) final
diff --git a/compiler/luci/service/src/CircleTypeInferenceRule.cpp b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
index 438c4a364..7616390ae 100644
--- a/compiler/luci/service/src/CircleTypeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
@@ -102,6 +102,11 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
     return node->dtype();
   }
 
+  // Densify yields the data type of its input
+  loco::DataType visit(const luci::CircleDensify *node) final
+  {
+    const auto input_dtype = luci::dtype_get(node->input());
+    return input_dtype;
+  }
+
   loco::DataType visit(const luci::CircleDepthToSpace *node) final
   {
     return luci::dtype_get(node->input());
diff --git a/compiler/luci/service/src/Nodes/CircleDensify.cpp b/compiler/luci/service/src/Nodes/CircleDensify.cpp
new file mode 100644
index 000000000..a0d15b6c7
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleDensify.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNodeLet<CN::DEF>::visit(const luci::CircleDensify *)
+{
+  return _graph->nodes()->create<luci::CircleDensify>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleDensify.test.cpp b/compiler/luci/service/src/Nodes/CircleDensify.test.cpp
new file mode 100644
index 000000000..d0f32c1a2
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleDensify.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Densify)
+{
+  auto g = loco::make_graph();
+  auto node_densify = g->nodes()->create<luci::CircleDensify>();
+
+  auto gc = loco::make_graph();
+  auto cloned = luci::clone_node(node_densify, gc.get());
+  ASSERT_NE(nullptr, cloned);
+  ASSERT_EQ(gc.get(), cloned->graph());
+
+  auto cloned_densify = dynamic_cast<luci::CircleDensify *>(cloned);
+  ASSERT_NE(nullptr, cloned_densify);
+}
diff --git a/compiler/luci/service/src/ShapeInfer_StridedSlice.cpp b/compiler/luci/service/src/ShapeInfer_StridedSlice.cpp
index c5864f938..77135cca0 100644
--- a/compiler/luci/service/src/ShapeInfer_StridedSlice.cpp
+++ b/compiler/luci/service/src/ShapeInfer_StridedSlice.cpp
@@ -24,16 +24,22 @@
 #include <loco/IR/NodeShape.h>
 #include <oops/InternalExn.h>
 
+#include <algorithm>
 #include <cmath>
 #include <cstdint>
 #include <limits>
 
+// code referenced from
+// https://github.com/tensorflow/tensorflow/blob/3f878cff5b698b82eea85db2b60d65a2e320850e/
+//    tensorflow/lite/kernels/strided_slice.cc
+//    tensorflow/lite/kernels/internal/strided_slice_logic.h
+
 namespace
 {
 
-// This Op only supports 1-4D cases and since we use the reference 4D
+// This Op only supports 1-5D cases and since we use the reference 4D
 // implementation, the 1-3D tensors are mapped to 4D.
-const int kMaxDim = 4;
+const int kMaxDim = 5;
 
 const loco::DataType S32 = loco::DataType::S32;
 
@@ -42,18 +48,47 @@ using int16 = int16_t;
 
 struct StridedSliceParams
 {
-  int8 start_indices_count;
+  int8 start_indices_count = 0;
   int16 start_indices[kMaxDim];
-  int8 stop_indices_count;
+  int8 stop_indices_count = 0;
   int16 stop_indices[kMaxDim];
-  int8 strides_count;
+  int8 strides_count = 0;
   int16 strides[kMaxDim];
 
-  int16 begin_mask;
-  int16 ellipsis_mask;
-  int16 end_mask;
-  int16 new_axis_mask;
-  int16 shrink_axis_mask;
+  int16 begin_mask = 0;
+  int16 ellipsis_mask = 0;
+  int16 end_mask = 0;
+  int16 new_axis_mask = 0;
+  int16 shrink_axis_mask = 0;
+};
+
+struct StridedSliceContext
+{
+  StridedSliceContext(const luci::CircleStridedSlice *node)
+  {
+    params.begin_mask = node->begin_mask();
+    params.ellipsis_mask = node->ellipsis_mask();
+    params.end_mask = node->end_mask();
+    params.new_axis_mask = node->new_axis_mask();
+    params.shrink_axis_mask = node->shrink_axis_mask();
+
+    input = loco::must_cast<luci::CircleNode *>(node->input());
+    begin = loco::must_cast<luci::CircleConst *>(node->begin());
+    end = loco::must_cast<luci::CircleConst *>(node->end());
+    strides = loco::must_cast<luci::CircleConst *>(node->strides());
+
+    loco::TensorShape input_shape = luci::shape_get(input).as<loco::TensorShape>();
+    input_dims = input_shape.rank();
+  }
+  StridedSliceParams params;
+  luci::CircleNode *input = nullptr;
+  luci::CircleConst *begin = nullptr;
+  luci::CircleConst *end = nullptr;
+  luci::CircleConst *strides = nullptr;
+
+  // Equivalent input shape after adding axis according to new_axis_mask.
+  loco::TensorShape effective_input_shape;
+  uint32_t input_dims = 0;
 };
 
 // Use until std::clamp() is available from C++17.
@@ -70,8 +105,8 @@ inline int Clamp(const int32_t v, const int32_t lo, const int32_t hi)
 // Return the index for the first element along that axis. This index will be a
 // positive integer between [0, axis_size - 1] that can be used to index
 // directly into the data.
-inline int StartForAxis(const StridedSliceParams &params, const loco::TensorShape &input_shape,
-                        uint32_t axis)
+inline int32_t StartForAxis(const StridedSliceParams &params, const loco::TensorShape &input_shape,
+                            uint32_t axis)
 {
   const auto begin_mask = params.begin_mask;
   const auto *start_indices = params.start_indices;
@@ -108,7 +143,16 @@ inline int StartForAxis(const StridedSliceParams &params, const loco::TensorShap
   }
 
   // Clamping
-  start = Clamp(start, 0, axis_size - 1);
+  if (strides[axis] > 0)
+  {
+    // Forward iteration
+    start = Clamp(start, 0, axis_size);
+  }
+  else
+  {
+    // Backward iteration
+    start = Clamp(start, -1, axis_size - 1);
+  }
 
   return start;
 }
@@ -118,14 +162,14 @@ inline int StartForAxis(const StridedSliceParams &params, const loco::TensorShap
 // element. ie. So if you were iterating through all elements of a 1D array of
 // size 4, this function would return 4 as the stop, because it is one past the
 // "real" indices of 0, 1, 2 & 3.
-inline int StopForAxis(const StridedSliceParams &params, const loco::TensorShape &input_shape,
-                       int axis, int start_for_axis)
+inline int32_t StopForAxis(const StridedSliceParams &params, const loco::TensorShape &input_shape,
+                           int32_t axis, int32_t start_for_axis)
 {
   const auto end_mask = params.end_mask;
   const auto shrink_axis_mask = params.shrink_axis_mask;
   const auto *stop_indices = params.stop_indices;
   const auto *strides = params.strides;
-  const int axis_size = static_cast<int32_t>(input_shape.dim(axis).value());
+  const int32_t axis_size = static_cast<int32_t>(input_shape.dim(axis).value());
   if (axis_size == 0)
   {
     return 0;
@@ -141,7 +185,7 @@ inline int StopForAxis(const StridedSliceParams &params, const loco::TensorShape
   // already been adjusted for negative indices.
   if (shrink_axis)
   {
-    stop = start_for_axis + 1;
+    return start_for_axis + 1;
   }
 
   // end_mask override
@@ -183,37 +227,125 @@ inline int StopForAxis(const StridedSliceParams &params, const loco::TensorShape
   return stop;
 }
 
-StridedSliceParams BuildStridedSliceParams(const luci::CircleStridedSlice *node)
+StridedSliceParams BuildStridedSliceParams(StridedSliceContext *op_context)
 {
   StridedSliceParams op_params;
 
-  if (kMaxDim < node->rank())
+  // The ellipsis_mask and new_axis_mask in op_params are not used. Those masks
+  // are processed here to update begin_mask, end_mask and the index range.
+  op_params.begin_mask = 0;
+  op_params.ellipsis_mask = 0;
+  op_params.end_mask = 0;
+  op_params.new_axis_mask = 0;
+  op_params.shrink_axis_mask = 0;
+
+  // Count indexes where the new_axis_mask is set but the ellipsis_mask is not.
+  loco::TensorShape begin_shape = luci::shape_get(op_context->begin).as<loco::TensorShape>();
+  const uint32_t begin_count = begin_shape.dim(0).value();
+  uint32_t num_add_axis = 0;
+  for (uint32_t i = 0; i < begin_count; ++i)
   {
-    INTERNAL_EXN_V("Cannot support StridedSlice rank > ", kMaxDim);
+    if (!((1 << i) & op_context->params.ellipsis_mask) &&
+        ((1 << i) & op_context->params.new_axis_mask))
+    {
+      num_add_axis++;
+    }
   }
 
-  auto begin_node = loco::must_cast<luci::CircleConst *>(node->begin());
-  auto end_node = loco::must_cast<luci::CircleConst *>(node->end());
-  auto strides_node = loco::must_cast<luci::CircleConst *>(node->strides());
+  // Calculate the dims of input after adding new axes.
+  const uint32_t effective_dims = op_context->input_dims + num_add_axis;
+
+  // If begin, end and strides are not fully provided, it means Ellipsis should
+  // be expanded to multiple dimensions (Ex: for spec [Ellipsis, 2] on a 3D
+  // input, the Ellipsis should be applied for the first 2 dimensions). Besides,
+  // if the new_axis_mask and the ellipsis_mask are set at the same index, the
+  // new_axis_mask will have no effect.
+  int32_t effective_ellipsis_mask = 0, effective_new_axis_mask = 0;
+  uint32_t ellipsis_start_idx = effective_dims, expanded_ellipsis = 0;
+  for (uint32_t i = 0; i < effective_dims;)
+  {
+    if ((1 << i) & op_context->params.ellipsis_mask)
+    {
+      ellipsis_start_idx = i;
+      uint32_t ellipsis_end_idx =
+        std::max(i + 1, std::min(i + 1 + num_add_axis + op_context->input_dims - begin_count,
+                                 effective_dims));
+      expanded_ellipsis = ellipsis_end_idx - ellipsis_start_idx - 1;
+
+      // Set bit for effective_ellipsis_mask.
+      for (; i < ellipsis_end_idx; ++i)
+      {
+        effective_ellipsis_mask |= (1 << i);
+      }
+      continue;
+    }
 
-  uint32_t dims_count = begin_node->size<S32>();
+    if ((1 << (i - expanded_ellipsis)) & op_context->params.new_axis_mask)
+    {
+      effective_new_axis_mask |= (1 << i);
+    }
+    ++i;
+  }
 
-  op_params.start_indices_count = dims_count;
-  op_params.stop_indices_count = dims_count;
-  op_params.strides_count = dims_count;
+  // Calculate effective_input_shape and its corresponding begin, end, strides.
+  loco::TensorShape input_shape = luci::shape_get(op_context->input).as<loco::TensorShape>();
+  uint32_t added_ellipsis = 0, added_axises = 0;
+  op_context->effective_input_shape.rank(effective_dims);
 
-  for (uint32_t i = 0; i < dims_count; ++i)
+  for (uint32_t i = 0; i < effective_dims; ++i)
   {
-    op_params.start_indices[i] = begin_node->at<S32>(i);
-    op_params.stop_indices[i] = end_node->at<S32>(i);
-    op_params.strides[i] = strides_node->at<S32>(i);
+    if ((1 << i) & effective_ellipsis_mask)
+    {
+      // If ellipsis_mask, set the begin_mask and end_mask at that index.
+      added_ellipsis = std::max(0u, i - ellipsis_start_idx);
+      op_params.begin_mask |= (1 << i);
+      op_params.end_mask |= (1 << i);
+      op_params.strides[i] = 1;
+      op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises);
+    }
+    else if ((1 << i) & effective_new_axis_mask)
+    {
+      // If new_axis_mask is set, it is equivalent to adding a new dim of 1 to
+      // input tensor. Store added shape to effective_input_shape.
+      op_params.start_indices[i] = 0;
+      op_params.stop_indices[i] = 1;
+      op_params.strides[i] = 1;
+      op_context->effective_input_shape.dim(i) = loco::Dimension(1);
+      added_axises++;
+    }
+    else if (i >= begin_count + expanded_ellipsis)
+    {
+      op_params.start_indices[i] = 0;
+      op_params.stop_indices[i] = 0;
+      op_params.strides[i] = 1;
+      op_params.begin_mask |= (1 << i);
+      op_params.end_mask |= (1 << i);
+      op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises);
+    }
+    else
+    {
+      const uint32_t orig_idx = i - added_ellipsis;
+      op_params.start_indices[i] = op_context->begin->at<S32>(orig_idx);
+      op_params.stop_indices[i] = op_context->end->at<S32>(orig_idx);
+      op_params.strides[i] = op_context->strides->at<S32>(orig_idx);
+      if (op_context->params.begin_mask & (1 << orig_idx))
+      {
+        op_params.begin_mask |= (1 << i);
+      }
+      if (op_context->params.end_mask & (1 << orig_idx))
+      {
+        op_params.end_mask |= (1 << i);
+      }
+      if (op_context->params.shrink_axis_mask & (1 << orig_idx))
+      {
+        op_params.shrink_axis_mask |= (1 << i);
+      }
+      op_context->effective_input_shape.dim(i) = input_shape.dim(i - added_axises);
+    }
   }
-
-  op_params.begin_mask = node->begin_mask();
-  op_params.ellipsis_mask = 0;
-  op_params.end_mask = node->end_mask();
-  op_params.new_axis_mask = 0;
-  op_params.shrink_axis_mask = node->shrink_axis_mask();
+  op_params.start_indices_count = effective_dims;
+  op_params.stop_indices_count = effective_dims;
+  op_params.strides_count = effective_dims;
 
   return op_params;
 }
@@ -241,55 +373,54 @@ loco::TensorShape infer_output_shape(const CircleStridedSlice *node)
   LUCI_ASSERT(end_node->dtype() == S32, "Only support S32 for end_node");
   LUCI_ASSERT(strides_node->dtype() == S32, "Only support S32 for strides_node");
 
-  assert(node->ellipsis_mask() == 0);
-  assert(node->new_axis_mask() == 0);
+  LUCI_ASSERT(begin_node->rank() == 1, "Only support rank 1 for begin_node");
+  LUCI_ASSERT(end_node->rank() == 1, "Only support rank 1 for end_node");
+  LUCI_ASSERT(strides_node->rank() == 1, "Only support rank 1 for strides_node");
 
-  auto op_params = BuildStridedSliceParams(node);
   loco::TensorShape input_shape = luci::shape_get(input_node).as<loco::TensorShape>();
 
-  uint32_t num_input_axes = input_shape.rank();
-  assert(begin_node->size<S32>() <= num_input_axes);
-  assert(end_node->size<S32>() <= num_input_axes);
-  assert(strides_node->size<S32>() <= num_input_axes);
-  for (uint32_t i = 0; i < strides_node->size<S32>(); i++)
-  {
-    LUCI_ASSERT(strides_node->at<S32>(i) != 0, "Stride value has to be non-zero");
-  }
+  assert(begin_node->size<S32>() <= input_shape.rank());
+  assert(end_node->size<S32>() <= input_shape.rank());
+  assert(strides_node->size<S32>() <= input_shape.rank());
 
-  uint32_t shape_size = 0;
-  std::array<int32_t, 16> output_shape_data;
+  StridedSliceContext op_context(node);
+  auto op_params = BuildStridedSliceParams(&op_context);
+  auto effective_input_shape = op_context.effective_input_shape;
+  std::vector<int32_t> output_shape_vector;
 
-  for (uint32_t idx = 0; idx < num_input_axes; ++idx)
+  for (int32_t idx = effective_input_shape.rank() - 1; idx >= 0; --idx)
   {
-    int32_t begin = StartForAxis(op_params, input_shape, idx);
-    int32_t end = StopForAxis(op_params, input_shape, idx, begin);
-    if (end < 0)
-      end = input_shape.dim(idx).value() + end + 1;
+    int32_t stride = op_params.strides[idx];
+    LUCI_ASSERT(stride != 0, "stride value has to be non-zero");
 
-    // This is valid for both positive and negative strides
-    int32_t stride = strides_node->at<S32>(idx);
-    int32_t dim_shape = std::ceil(static_cast<float>(end - begin) / stride);
-    assert(dim_shape > 0);
+    int32_t begin = StartForAxis(op_params, effective_input_shape, idx);
+    int32_t end = StopForAxis(op_params, effective_input_shape, idx, begin);
 
     // When shrinking an axis, the end position does not matter (and can be
     // incorrect when negative indexing is used, see Issue #19260). Always use
     // begin + 1 to generate a length 1 slice, since begin has
-    // already been adjusted for negative indices by StartForAxis.
-    const bool shrink_axis = node->shrink_axis_mask() & (1 << idx);
+    // already been adjusted for negative indices by StartForAxis.
+    const bool shrink_axis = op_params.shrink_axis_mask & (1 << idx);
     if (shrink_axis)
     {
-      assert(dim_shape == 1);
+      end = begin + 1;
     }
-    else
+
+    // This is valid for both positive and negative strides
+    int32_t dim_shape = std::ceil((end - begin) / static_cast<float>(stride));
+    dim_shape = dim_shape < 0 ? 0 : dim_shape;
+    if (!shrink_axis)
     {
-      output_shape_data[shape_size++] = dim_shape;
+      output_shape_vector.push_back(dim_shape);
     }
   }
 
+  auto shape_size = output_shape_vector.size();
   output_shape.rank(shape_size);
   for (uint32_t idx = 0; idx < shape_size; ++idx)
   {
-    output_shape.dim(idx) = output_shape_data[idx];
+    // reverse copy
+    output_shape.dim(idx) = output_shape_vector.at(shape_size - 1u - idx);
   }
 
   return output_shape;
diff --git a/compiler/luci/tests/test.lst b/compiler/luci/tests/test.lst
index 94e723f21..09a25ff08 100644
--- a/compiler/luci/tests/test.lst
+++ b/compiler/luci/tests/test.lst
@@ -39,6 +39,7 @@ addread(Conv2D_003)
 addread(Conv2D_U8_000)
 addread(Conv2D_U8_001)
 addread(Cos_000)
+addread(Densify_000)
 addread(DepthToSpace_000)
 addread(DepthwiseConv2D_000)
 addread(DepthwiseConv2D_U8_000)
@@ -265,6 +266,7 @@ addwrite(Conv2D_003)
 addwrite(Conv2D_U8_000)
 addwrite(Conv2D_U8_001)
 addwrite(Cos_000)
+addwrite(Densify_000)
 addwrite(DepthToSpace_000)
 addwrite(DepthwiseConv2D_000)
 addwrite(DepthwiseConv2D_U8_000)
diff --git a/compiler/mio-circle04/include/mio_circle/Helper.h b/compiler/mio-circle04/include/mio_circle/Helper.h
index d3ffc23e5..7a1ba2b2f 100644
--- a/compiler/mio-circle04/include/mio_circle/Helper.h
+++ b/compiler/mio-circle04/include/mio_circle/Helper.h
@@ -19,6 +19,8 @@
 
 #include <mio/circle/schema_generated.h>
 
+#include <vector>
+
 namespace mio
 {
 namespace circle
@@ -31,6 +33,21 @@ std::string opcode_name(const ::circle::OperatorCode *opcode);
 const char *tensor_type(const ::circle::Tensor *tensor);
 const char *tensor_name(const ::circle::Tensor *tensor);
 
+template <typename T> std::vector<T> as_index_vector(const flatbuffers::Vector<T> *flat_array)
+{
+  if (flat_array == nullptr)
+  {
+    throw std::runtime_error("flat array is nullptr");
+  }
+
+  std::vector<T> ret(flat_array->Length());
+  for (uint32_t i = 0; i < flat_array->Length(); i++)
+  {
+    ret[i] = flat_array->Get(i);
+  }
+  return ret;
+}
+
 } // namespace circle
 } // namespace mio
 
diff --git a/compiler/circledump/src/Read.h b/compiler/mio-circle04/include/mio_circle/Reader.h
index 05b0e5072..630646732 100644
--- a/compiler/circledump/src/Read.h
+++ b/compiler/mio-circle04/include/mio_circle/Reader.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef __CIRCLEREAD_READ_H__
-#define __CIRCLEREAD_READ_H__
+#ifndef __MIO_CIRCLE04_READER_H__
+#define __MIO_CIRCLE04_READER_H__
 
 #include <mio/circle/schema_generated.h>
 
@@ -23,23 +23,14 @@
 #include <string>
 #include <vector>
 
-namespace circleread
-{
+// NOTE Reader class originated from circledump and for circle-tensordump
+//      where this class has more work to be done for stability
+//      as the tools are for developers, not customers.
 
-template <typename T> std::vector<T> as_index_vector(const flatbuffers::Vector<T> *flat_array)
+namespace mio
+{
+namespace circle
 {
-  if (flat_array == nullptr)
-  {
-    throw std::runtime_error("flat array is nullptr");
-  }
-
-  std::vector<T> ret(flat_array->Length());
-  for (uint32_t i = 0; i < flat_array->Length(); i++)
-  {
-    ret[i] = flat_array->Get(i);
-  }
-  return ret;
-}
 
 /**
  * @brief Loads Circle file and provides helpers to access attributes
@@ -47,36 +38,39 @@ template <typename T> std::vector<T> as_index_vector(const flatbuffers::Vector<T
 class Reader
 {
 private:
-  using CircleSubGraphs_t = flatbuffers::Vector<flatbuffers::Offset<circle::SubGraph>>;
-  using CircleBuffers_t = flatbuffers::Vector<flatbuffers::Offset<circle::Buffer>>;
-  using CircleTensors_t = flatbuffers::Vector<flatbuffers::Offset<circle::Tensor>>;
-  using CircleOperators_t = flatbuffers::Vector<flatbuffers::Offset<circle::Operator>>;
-  using CircleMetadata_t = flatbuffers::Vector<flatbuffers::Offset<circle::Metadata>>;
-  using CircleSignatureDef_t = flatbuffers::Vector<flatbuffers::Offset<circle::SignatureDef>>;
+  using CircleSubGraphs_t = flatbuffers::Vector<flatbuffers::Offset<::circle::SubGraph>>;
+  using CircleBuffers_t = flatbuffers::Vector<flatbuffers::Offset<::circle::Buffer>>;
+  using CircleTensors_t = flatbuffers::Vector<flatbuffers::Offset<::circle::Tensor>>;
+  using CircleOperators_t = flatbuffers::Vector<flatbuffers::Offset<::circle::Operator>>;
+  using CircleMetadata_t = flatbuffers::Vector<flatbuffers::Offset<::circle::Metadata>>;
+  using CircleSignatureDef_t = flatbuffers::Vector<flatbuffers::Offset<::circle::SignatureDef>>;
 
 public:
-  Reader(const circle::Model *model);
+  Reader(const ::circle::Model *model);
 
   Reader() = delete;
 
 public:
   uint32_t version() const { return _version; }
 
-  const std::vector<const circle::OperatorCode *> &opcodes() { return _op_codes; }
+  const std::vector<const ::circle::OperatorCode *> &opcodes() { return _op_codes; }
   const CircleBuffers_t *buffers() { return _buffers; }
   const CircleTensors_t *tensors() { return _tensors; }
   const CircleOperators_t *operators() { return _operators; }
   const std::vector<int32_t> &inputs() const { return _inputs; }
   const std::vector<int32_t> &outputs() const { return _outputs; }
-  const circle::DataFormat &data_format() const { return _data_format; }
+  const ::circle::DataFormat &data_format() const { return _data_format; }
   const CircleMetadata_t *metadata() const { return _metadata; }
   const CircleSignatureDef_t *signature_defs() const { return _signature_defs; }
 
   uint32_t num_subgraph() const { return _subgraphs->Length(); }
 
   size_t buffer_info(uint32_t buf_idx, const uint8_t **buff_data);
-  circle::BuiltinOperator builtin_code(const circle::Operator *op) const;
-  std::string opcode_name(const circle::Operator *op) const;
+  ::circle::BuiltinOperator builtin_code(const ::circle::Operator *op) const;
+  std::string opcode_name(const ::circle::Operator *op) const;
+  std::vector<int32_t> outputs(const ::circle::Operator *op) const;
+  std::string tensor_name(const ::circle::Tensor *tensor) const;
+  std::string tensor_dtype(const ::circle::Tensor *tensor) const;
 
 public:
   bool select_subgraph(uint32_t subgraph);
@@ -95,12 +89,13 @@ private:
 
   uint32_t _subgraph_index = 0;
   std::string _subgraph_name;
-  std::vector<const circle::OperatorCode *> _op_codes;
+  std::vector<const ::circle::OperatorCode *> _op_codes;
   std::vector<int32_t> _inputs;
   std::vector<int32_t> _outputs;
-  circle::DataFormat _data_format = circle::DataFormat::DataFormat_CHANNELS_FIRST;
+  ::circle::DataFormat _data_format = ::circle::DataFormat::DataFormat_CHANNELS_FIRST;
 };
 
-} // namespace circleread
+} // namespace circle
+} // namespace mio
 
-#endif // __CIRCLEREAD_READ_H__
+#endif // __MIO_CIRCLE04_READER_H__
diff --git a/compiler/circle-inspect/src/Reader.cpp b/compiler/mio-circle04/src/Reader.cpp
index 0e2865254..880ffaec8 100644
--- a/compiler/circle-inspect/src/Reader.cpp
+++ b/compiler/mio-circle04/src/Reader.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,20 +14,29 @@
  * limitations under the License.
  */
 
-#include "Reader.h"
-
-#include <mio_circle/Helper.h>
+#include "mio_circle/Reader.h"
+#include "mio_circle/Helper.h"
 
 #include <sstream>
 #include <string>
 
-namespace circleinspect
+namespace mio
+{
+namespace circle
 {
 
-Reader::Reader(const circle::Model *model)
+Reader::Reader(const ::circle::Model *model)
 {
+  if (model == nullptr)
+  {
+    throw std::runtime_error("Invalid model");
+  }
+
+  _version = model->version();
   _subgraphs = model->subgraphs();
   _buffers = model->buffers();
+  _metadata = model->metadata();
+  _signature_defs = model->signature_defs();
 
   auto opcodes = model->operator_codes();
   for (const ::circle::OperatorCode *opcode : *opcodes)
@@ -64,20 +73,20 @@ size_t Reader::buffer_info(uint32_t buf_idx, const uint8_t **buff_data)
   return 0;
 }
 
-circle::BuiltinOperator Reader::builtin_code(const circle::Operator *op) const
+::circle::BuiltinOperator Reader::builtin_code(const ::circle::Operator *op) const
 {
   uint32_t index = op->opcode_index();
   assert(index < _op_codes.size());
-  const circle::OperatorCode *opcode = _op_codes.at(index);
+  const ::circle::OperatorCode *opcode = _op_codes.at(index);
 
   return mio::circle::builtin_code_neutral(opcode);
 }
 
-std::string Reader::opcode_name(const circle::Operator *op) const
+std::string Reader::opcode_name(const ::circle::Operator *op) const
 {
   uint32_t index = op->opcode_index();
   assert(index < _op_codes.size());
-  const circle::OperatorCode *opcode = _op_codes.at(index);
+  const ::circle::OperatorCode *opcode = _op_codes.at(index);
 
   if (!mio::circle::is_valid(opcode))
   {
@@ -89,18 +98,24 @@ std::string Reader::opcode_name(const circle::Operator *op) const
   return mio::circle::opcode_name(opcode);
 }
 
-std::string Reader::tensor_name(const circle::Tensor *tensor) const
+std::vector<int32_t> Reader::outputs(const ::circle::Operator *op) const
+{
+  return as_index_vector(op->outputs());
+}
+
+std::string Reader::tensor_name(const ::circle::Tensor *tensor) const
 {
   return mio::circle::tensor_name(tensor);
 }
 
-std::string Reader::tensor_dtype(const circle::Tensor *tensor) const
+std::string Reader::tensor_dtype(const ::circle::Tensor *tensor) const
 {
   return mio::circle::tensor_type(tensor);
 }
 
 bool Reader::select_subgraph(uint32_t sgindex)
 {
+  _subgraph_index = sgindex;
   _tensors = nullptr;
   _operators = nullptr;
 
@@ -113,10 +128,14 @@ bool Reader::select_subgraph(uint32_t sgindex)
     return false;
   }
 
-  const circle::SubGraph *subgraph = (*_subgraphs)[sgindex];
+  const ::circle::SubGraph *subgraph = (*_subgraphs)[sgindex];
+
+  auto name = subgraph->name();
+  _subgraph_name = name ? name->c_str() : "(noname)";
 
   _tensors = subgraph->tensors();
   _operators = subgraph->operators();
+  _data_format = subgraph->data_format();
 
   _inputs = as_index_vector(subgraph->inputs());
   _outputs = as_index_vector(subgraph->outputs());
@@ -124,4 +143,5 @@ bool Reader::select_subgraph(uint32_t sgindex)
   return true;
 }
 
-} // namespace circleinspect
+} // namespace circle
+} // namespace mio
diff --git a/compiler/mio-circle04/src/Reader.test.cpp b/compiler/mio-circle04/src/Reader.test.cpp
new file mode 100644
index 000000000..104454a62
--- /dev/null
+++ b/compiler/mio-circle04/src/Reader.test.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mio_circle/Reader.h"
+
+#include <flatbuffers/flatbuffers.h>
+#include <gtest/gtest.h>
+
+class mio_circle04_reader_test : public ::testing::Test
+{
+protected:
+  void initialization_emty(void)
+  {
+    _model = circle::CreateModelDirect(_fbb, 0, &_opcodes_vec);
+    circle::FinishModelBuffer(_fbb, _model);
+  }
+
+  const circle::Model *circleModel(void)
+  {
+    auto ptr = _fbb.GetBufferPointer();
+    return circle::GetModel(ptr);
+  }
+
+private:
+  flatbuffers::FlatBufferBuilder _fbb;
+  flatbuffers::Offset<circle::Model> _model;
+  std::vector<flatbuffers::Offset<circle::OperatorCode>> _opcodes_vec;
+};
+
+TEST_F(mio_circle04_reader_test, null_Model_NEG)
+{
+  EXPECT_THROW(mio::circle::Reader reader(nullptr), std::runtime_error);
+}
+
+TEST_F(mio_circle04_reader_test, empty_Model)
+{
+  initialization_emty();
+
+  const circle::Model *model = circleModel();
+  EXPECT_NE(nullptr, model);
+
+  mio::circle::Reader reader(model);
+
+  SUCCEED();
+}
+
+// TODO add more tests
diff --git a/compiler/mio-tflite/README.md b/compiler/mio-tflite/README.md
index 187b1a5c6..c717ab877 100644
--- a/compiler/mio-tflite/README.md
+++ b/compiler/mio-tflite/README.md
@@ -1,3 +1,5 @@
 # mio-tflite
 
 _mio-tflite_ provides a library to access TensorFlow lite model files
+
+NOTE: _mio-tflite_ is currently obsolete
diff --git a/compiler/mio-tflite260/README.md b/compiler/mio-tflite260/README.md
index 970569b47..86d2998ed 100644
--- a/compiler/mio-tflite260/README.md
+++ b/compiler/mio-tflite260/README.md
@@ -1,3 +1,5 @@
 # mio-tflite260
 
 _mio-tflite260_ provides a library to access TensorFlow lite model files with V2.6.0.
+
+NOTE: _mio-tflite260_ is currently obsolete
diff --git a/compiler/mir/include/mir/Graph.h b/compiler/mir/include/mir/Graph.h
index bf94cfb14..37bfdb361 100644
--- a/compiler/mir/include/mir/Graph.h
+++ b/compiler/mir/include/mir/Graph.h
@@ -103,6 +103,10 @@ private:
 
 /**
  * @brief Returns nodes of the graph sorted topologically.
+ * @note  Sorting order priority
+ * 1) Graph input node (input index order)
+ * 2) Constant node (unordered - cannot predict order)
+ * 3) Ready node (unordered - cannot predict order)
  */
 std::vector<Operation *> getSortedNodes(Graph *graph);
 
diff --git a/compiler/mir/src/Graph.cpp b/compiler/mir/src/Graph.cpp
index 04b005de4..05d6dc9bd 100644
--- a/compiler/mir/src/Graph.cpp
+++ b/compiler/mir/src/Graph.cpp
@@ -44,9 +44,16 @@ std::vector<Operation *> getSortedNodes(Graph *graph)
   std::deque<Operation *> ready_nodes;
   std::unordered_map<Operation *, std::size_t> num_visited_input_edges;
 
+  // Use input vector first to maintain correct input order
+  for (Operation *op : graph->getInputs())
+  {
+    ready_nodes.push_back(op);
+  }
+
   for (Operation *op : graph->getNodes())
   {
-    if (op->getNumInputs() == 0)
+    // Skip already pushed input node
+    if ((op->getNumInputs() == 0) && (op->getType() != Operation::Type::input))
     {
       ready_nodes.push_back(op);
     }
diff --git a/compiler/mir2loco/src/mir2loco.test.cpp b/compiler/mir2loco/src/mir2loco.test.cpp
index 92ab99488..244c92aa8 100644
--- a/compiler/mir2loco/src/mir2loco.test.cpp
+++ b/compiler/mir2loco/src/mir2loco.test.cpp
@@ -383,28 +383,49 @@ TEST_F(TestTransformer_mir2loco, Conv2D_Test)
   auto loco_graph = transformer.transform(&mir_graph);
 
   loco::Pull *pull_node = dynamic_cast<loco::Pull *>(loco_graph->nodes()->at(0));
-  loco::ConstGen *const_node = dynamic_cast<loco::ConstGen *>(loco_graph->nodes()->at(1));
-  loco::FeatureEncode *encode_node =
-    dynamic_cast<loco::FeatureEncode *>(loco_graph->nodes()->at(2));
-  loco::FilterEncode *filter_node = dynamic_cast<loco::FilterEncode *>(loco_graph->nodes()->at(3));
-  loco::Conv2D *conv_node = dynamic_cast<loco::Conv2D *>(loco_graph->nodes()->at(4));
-  loco::FeatureDecode *decode_node =
-    dynamic_cast<loco::FeatureDecode *>(loco_graph->nodes()->at(5));
-  loco::Push *push_node = dynamic_cast<loco::Push *>(loco_graph->nodes()->at(6));
-
   ASSERT_NE(pull_node, nullptr);
+
+  // ConstGen: Only one ConstGen node
+  // We can be sure this node is the input of FilterEncode because it is the only ConstGen node
+  loco::ConstGen *const_node = dynamic_cast<loco::ConstGen *>(loco_graph->nodes()->at(1));
   ASSERT_NE(const_node, nullptr);
-  ASSERT_NE(filter_node, nullptr);
+
+  // FeatureEncode
+  auto pull_uses = loco::succs(pull_node);
+  ASSERT_EQ(pull_uses.size(), 1);
+  loco::FeatureEncode *encode_node = dynamic_cast<loco::FeatureEncode *>(*pull_uses.begin());
   ASSERT_NE(encode_node, nullptr);
-  ASSERT_NE(conv_node, nullptr);
-  ASSERT_NE(decode_node, nullptr);
-  ASSERT_NE(push_node, nullptr);
   ASSERT_EQ(encode_node->input(), pull_node);
-  ASSERT_EQ(filter_node->input(), const_node);
+
+  // Conv2D
+  auto encode_uses = loco::succs(encode_node);
+  ASSERT_EQ(encode_uses.size(), 1);
+  loco::Conv2D *conv_node = dynamic_cast<loco::Conv2D *>(*encode_uses.begin());
+  ASSERT_NE(conv_node, nullptr);
   ASSERT_EQ(conv_node->ifm(), encode_node);
+
+  // FilterEncode
+  auto const_uses = loco::succs(const_node);
+  ASSERT_EQ(const_uses.size(), 1);
+  loco::FilterEncode *filter_node = dynamic_cast<loco::FilterEncode *>(*const_uses.begin());
+  ASSERT_NE(filter_node, nullptr);
+  ASSERT_EQ(filter_node->input(), const_node);
   ASSERT_EQ(conv_node->ker(), filter_node);
+
+  // FeatureDecode
+  auto conv_uses = loco::succs(conv_node);
+  ASSERT_EQ(conv_uses.size(), 1);
+  loco::FeatureDecode *decode_node = dynamic_cast<loco::FeatureDecode *>(*conv_uses.begin());
+  ASSERT_NE(decode_node, nullptr);
   ASSERT_EQ(decode_node->input(), conv_node);
+
+  // Push
+  auto decode_uses = loco::succs(decode_node);
+  ASSERT_EQ(decode_uses.size(), 1);
+  loco::Push *push_node = dynamic_cast<loco::Push *>(*decode_uses.begin());
+  ASSERT_NE(push_node, nullptr);
   ASSERT_EQ(push_node->from(), decode_node);
+
   // Check params
   ASSERT_EQ(conv_node->pad()->top(), 5);
   ASSERT_EQ(conv_node->pad()->left(), 9);
diff --git a/compiler/moco/import/src/Importer.cpp b/compiler/moco/import/src/Importer.cpp
index 333f0f6a9..0659fd165 100644
--- a/compiler/moco/import/src/Importer.cpp
+++ b/compiler/moco/import/src/Importer.cpp
@@ -190,7 +190,7 @@ std::unique_ptr<loco::Graph> Importer::import(const ModelSignature &signature,
 
   convert_graph(*source_ptr, signature, tf_graph_def, graph.get());
 
-  return std::move(graph);
+  return graph;
 }
 
 } // namespace moco
diff --git a/compiler/moco/lang/src/IR/TFNode.cpp b/compiler/moco/lang/src/IR/TFNode.cpp
index 55c0e0c64..b59a505b5 100644
--- a/compiler/moco/lang/src/IR/TFNode.cpp
+++ b/compiler/moco/lang/src/IR/TFNode.cpp
@@ -17,6 +17,7 @@
 #include "moco/IR/TFNode.h"
 #include "moco/IR/TFDialect.h"
 
+#include <limits>
 #include <memory>
 #include <cassert>
 
diff --git a/compiler/one-cmds/CMakeLists.txt b/compiler/one-cmds/CMakeLists.txt
index 8732340ae..90e989a00 100644
--- a/compiler/one-cmds/CMakeLists.txt
+++ b/compiler/one-cmds/CMakeLists.txt
@@ -8,7 +8,9 @@ set(ONE_COMMAND_FILES
     one-optimize
     one-quantize
     one-pack
+    one-partition
     one-profile
+    one-infer
     one-codegen
     one-prepare-venv
     onecc
@@ -74,7 +76,11 @@ endforeach(ONE_UTILITY)
 
 # make python directory
 set(ONE_PYTHON_FILES constant.py
-                     make_cmd.py)
+                     make_cmd.py
+                     CfgRunner.py
+                     OptionBuilder.py
+                     TopologicalSortHelper.py
+                     WorkflowRunner.py)
 
 foreach(ONE_PYTHON_FILE IN ITEMS ${ONE_PYTHON_FILES})
 
diff --git a/compiler/one-cmds/dummy-driver/CMakeLists.txt b/compiler/one-cmds/dummy-driver/CMakeLists.txt
index 690a60776..2552a02db 100644
--- a/compiler/one-cmds/dummy-driver/CMakeLists.txt
+++ b/compiler/one-cmds/dummy-driver/CMakeLists.txt
@@ -1,16 +1,25 @@
 # dummy driver for interface test
 set(DUMMY_DRIVER_SRC src/dummy-compile.cpp)
 set(HELP_DRIVER_SRC src/help-compile.cpp)
+set(DUMMY_INFER_SRC src/dummy-infer.cpp)
+set(DUMMY_INFER_V2_SRC src/dummy-inferV2.cpp)
+set(HELP_INFER_SRC src/help-infer.cpp)
 set(DUMMY_PROFILE_SRC src/dummy-profile.cpp)
 set(HELP_PROFILE_SRC src/help-profile.cpp)
 
 add_executable(dummy-compile ${DUMMY_DRIVER_SRC})
 add_executable(help-compile ${HELP_DRIVER_SRC})
+add_executable(dummy-infer ${DUMMY_INFER_SRC})
+add_executable(dummy-inferV2 ${DUMMY_INFER_V2_SRC})
+add_executable(help-infer ${HELP_INFER_SRC})
 add_executable(dummy-profile ${DUMMY_PROFILE_SRC})
 add_executable(help-profile ${HELP_PROFILE_SRC})
 
 set(DUMMY_DRIVER "${CMAKE_CURRENT_BINARY_DIR}/dummy-compile")
 set(HELP_DRIVER "${CMAKE_CURRENT_BINARY_DIR}/help-compile")
+set(DUMMY_INFER "${CMAKE_CURRENT_BINARY_DIR}/dummy-infer")
+set(DUMMY_INFER_V2 "${CMAKE_CURRENT_BINARY_DIR}/dummy-inferV2")
+set(HELP_INFER "${CMAKE_CURRENT_BINARY_DIR}/help-infer")
 set(DUMMY_PROFILE "${CMAKE_CURRENT_BINARY_DIR}/dummy-profile")
 set(HELP_PROFILE "${CMAKE_CURRENT_BINARY_DIR}/help-profile")
 
@@ -26,6 +35,24 @@ install(FILES ${HELP_DRIVER}
                     WORLD_READ WORLD_EXECUTE
         DESTINATION test)
 
+install(FILES ${DUMMY_INFER}
+        PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
+                    GROUP_READ GROUP_EXECUTE
+                    WORLD_READ WORLD_EXECUTE
+        DESTINATION test)
+
+install(FILES ${DUMMY_INFER_V2}
+        PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
+                    GROUP_READ GROUP_EXECUTE
+                    WORLD_READ WORLD_EXECUTE
+        DESTINATION test)
+
+install(FILES ${HELP_INFER}
+        PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
+                    GROUP_READ GROUP_EXECUTE
+                    WORLD_READ WORLD_EXECUTE
+        DESTINATION test)
+
 install(FILES ${DUMMY_PROFILE}
         PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
                     GROUP_READ GROUP_EXECUTE
diff --git a/compiler/one-cmds/dummy-driver/src/dummy-infer.cpp b/compiler/one-cmds/dummy-driver/src/dummy-infer.cpp
new file mode 100644
index 000000000..60f5faefa
--- /dev/null
+++ b/compiler/one-cmds/dummy-driver/src/dummy-infer.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * dummy-infer only tests its interface rather than its functionality.
+ *
+ * ./dummy-infer ${INPUT_NAME}
+ * dummy-infer dummy output!!!
+ */
+
+#include <iostream>
+
+int main(int argc, char **argv)
+{
+  if (argc != 2)
+    return EXIT_FAILURE;
+
+  std::cout << "dummy-infer dummy output!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
diff --git a/compiler/one-cmds/dummy-driver/src/dummy-inferV2.cpp b/compiler/one-cmds/dummy-driver/src/dummy-inferV2.cpp
new file mode 100644
index 000000000..4b93c70a3
--- /dev/null
+++ b/compiler/one-cmds/dummy-driver/src/dummy-inferV2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * dummy-inferV2 only tests its interface rather than its functionality.
+ *
+ * ./dummy-inferV2 ${INPUT_NAME}
+ * Do inference of ${INPUT_NAME}
+ */
+
+#include <iostream>
+
+int main(int argc, char **argv)
+{
+  if (argc != 2)
+    return EXIT_FAILURE;
+
+  std::cout << "Do inference of " + std::string(argv[1]) << std::endl;
+
+  return EXIT_SUCCESS;
+}
diff --git a/compiler/one-cmds/dummy-driver/src/help-infer.cpp b/compiler/one-cmds/dummy-driver/src/help-infer.cpp
new file mode 100644
index 000000000..821d368d4
--- /dev/null
+++ b/compiler/one-cmds/dummy-driver/src/help-infer.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * help-infer prints dummy help message.
+ *
+ * $ ./help-infer -h
+ * HELP MESSAGE!!
+ */
+
+#include <iostream>
+#include <fstream>
+#include <string>
+
+int main(int argc, char **argv)
+{
+  if (argc != 2)
+    return EXIT_FAILURE;
+
+  std::string opt_h{"-h"};
+  std::string argv_1{argv[1]};
+
+  if (opt_h != argv_1)
+    return EXIT_FAILURE;
+
+  std::cout << "HELP MESSAGE!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
diff --git a/compiler/one-cmds/how-to-use-one-commands.txt b/compiler/one-cmds/how-to-use-one-commands.txt
index ebc165167..2352bbd7a 100644
--- a/compiler/one-cmds/how-to-use-one-commands.txt
+++ b/compiler/one-cmds/how-to-use-one-commands.txt
@@ -153,6 +153,7 @@ Current transformation options are
 - expand_broadcast_const : This will expand broadcastable constant node inputs
 - fold_add_v2 : This removes AddV2 operation which can be folded
 - fold_cast : This removes Cast operation which can be folded
+- fold_densify: This removes Densify operator which can be folded
 - fold_dequantize : This removes Dequantize operation which can be folded
 - fold_dwconv : This folds Depthwise Convolution operation which can be folded
 - fold_gather : This removes Gather operation which can be folded
@@ -205,10 +206,6 @@ Current transformation options are
 - transform_min_max_to_relu6: This will transform Minimum-Maximum pattern to Relu6 operator.
 - transform_min_relu_to_relu6: This will transform Minimum(6)-Relu pattern to Relu6 operator.
 
-There are options to enable multiple options at once for convenience.
-- O1: fuse_bcq, fuse_instnorm, resolve_customop_add, resolve_customop_batchmatmul,
-  resolve_customop_matmul, remove_redundant_transpose, substitute_pack_to_reshape
-
 
 one-quantize
 ------------
diff --git a/compiler/one-cmds/one-build b/compiler/one-cmds/one-build
index 5c313b44b..4b1f98070 100644
--- a/compiler/one-cmds/one-build
+++ b/compiler/one-cmds/one-build
@@ -22,7 +22,6 @@
 import argparse
 import configparser
 import os
-import subprocess
 import sys
 
 import utils as _utils
@@ -83,6 +82,7 @@ def _get_driver_name(driver_name):
         'one-import-onnx': 'one-import-onnx',
         'one-optimize': 'one-optimize',
         'one-quantize': 'one-quantize',
+        'one-partition': 'one-partition',
         'one-pack': 'one-pack',
         'one-codegen': 'one-codegen'
     }[driver_name]
@@ -157,7 +157,8 @@ def main():
     bin_dir = os.path.dirname(os.path.realpath(__file__))
     import_drivers_dict = _utils._detect_one_import_drivers(bin_dir)
     transform_drivers = [
-        'one-optimize', 'one-quantize', 'one-pack', 'one-codegen', 'one-profile'
+        'one-optimize', 'one-quantize', 'one-pack', 'one-codegen', 'one-profile',
+        'one-partition'
     ]
     _verify_cfg(import_drivers_dict, config)
 
diff --git a/compiler/one-cmds/one-build.template.cfg b/compiler/one-cmds/one-build.template.cfg
index e147896ef..42960811e 100644
--- a/compiler/one-cmds/one-build.template.cfg
+++ b/compiler/one-cmds/one-build.template.cfg
@@ -5,6 +5,7 @@ one-import-bcq=False
 one-import-onnx=False
 one-optimize=True
 one-quantize=False
+one-partition=False
 one-pack=True
 one-codegen=False
 
diff --git a/compiler/one-cmds/one-codegen b/compiler/one-cmds/one-codegen
index 726538d44..86e1632e6 100644
--- a/compiler/one-cmds/one-codegen
+++ b/compiler/one-cmds/one-codegen
@@ -25,9 +25,7 @@ import glob
 import itertools
 import ntpath
 import os
-import subprocess
 import sys
-import tempfile
 import shutil
 
 import utils as _utils
diff --git a/compiler/one-cmds/one-import-bcq b/compiler/one-cmds/one-import-bcq
index ef89a9297..c3ef0b275 100644
--- a/compiler/one-cmds/one-import-bcq
+++ b/compiler/one-cmds/one-import-bcq
@@ -21,7 +21,6 @@
 
 import argparse
 import os
-import subprocess
 import sys
 import tempfile
 
@@ -160,9 +159,9 @@ def _convert(args):
             tmpdir,
             os.path.splitext(
                 os.path.basename(generate_bcq_metadata_output_path))[0]) + '.tflite'
-        tf2tfliteV2_cmd = _make_cmd.make_tf2tfliteV2_cmd(args, tf2tfliteV2_path,
-                                                       generate_bcq_metadata_output_path,
-                                                       tf2tfliteV2_output_path)
+        tf2tfliteV2_cmd = _make_cmd.make_tf2tfliteV2_cmd(
+            args, tf2tfliteV2_path, generate_bcq_metadata_output_path,
+            tf2tfliteV2_output_path)
         try:
             output_arrays_idx = tf2tfliteV2_cmd.index('--output_arrays')
             tf2tfliteV2_cmd[output_arrays_idx + 1] = ','.join(bcq_output_arrays)
@@ -177,8 +176,8 @@ def _convert(args):
         # make a command to convert from tflite to circle
         tflite2circle_path = os.path.join(dir_path, 'tflite2circle')
         tflite2circle_cmd = _make_cmd.make_tflite2circle_cmd(tflite2circle_path,
-                                                           tf2tfliteV2_output_path,
-                                                           getattr(args, 'output_path'))
+                                                             tf2tfliteV2_output_path,
+                                                             getattr(args, 'output_path'))
 
         f.write((' '.join(tflite2circle_cmd) + '\n').encode())
 
diff --git a/compiler/one-cmds/one-import-onnx b/compiler/one-cmds/one-import-onnx
index eaa136197..ad19c2f59 100644
--- a/compiler/one-cmds/one-import-onnx
+++ b/compiler/one-cmds/one-import-onnx
@@ -21,7 +21,6 @@
 
 import argparse
 import os
-import subprocess
 import sys
 import tempfile
 import onnx
@@ -80,6 +79,12 @@ def _get_parser():
     parser.add_argument('--unroll_rnn', action='store_true', help='Unroll RNN operators')
     parser.add_argument(
         '--unroll_lstm', action='store_true', help='Unroll LSTM operators')
+    parser.add_argument(
+        '--keep_io_order',
+        action='store_true',
+        help=
+        'Ensure generated circle model preserves the I/O order of the original onnx model.'
+    )
 
     # save intermediate file(s)
     parser.add_argument(
@@ -87,6 +92,12 @@ def _get_parser():
         action='store_true',
         help='Save intermediate files to output folder')
 
+    # experimental options
+    parser.add_argument(
+        '--experimental_disable_batchmatmul_unfold',
+        action='store_true',
+        help='Experimental disable BatchMatMul unfold')
+
     return parser
 
 
@@ -124,6 +135,65 @@ def _apply_verbosity(verbosity):
         os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 
 
+# The index of input/output is added in front of the name. For example,
+# Original input names: 'a', 'c', 'b'
+# Renamed: '0001_a', '0002_c', '0003_b'
+# This will preserve I/O order after import.
+def _remap_io_names(onnx_model):
+    # gather existing name of I/O and generate new name of I/O in sort order
+    input_nodes = []
+    output_nodes = []
+    remap_inputs = []
+    remap_outputs = []
+    initializers = []
+    # some models may have initializers as inputs. ignore them.
+    for initializer in onnx_model.graph.initializer:
+        initializers.append(initializer.name)
+    for idx in range(0, len(onnx_model.graph.input)):
+        name = onnx_model.graph.input[idx].name
+        if not name in initializers:
+            input_nodes.append(name)
+            remap_inputs.append(format(idx + 1, '04d') + '_' + name)
+    for idx in range(0, len(onnx_model.graph.output)):
+        name = onnx_model.graph.output[idx].name
+        output_nodes.append(name)
+        remap_outputs.append(format(idx + 1, '04d') + '_' + name)
+    # change names for graph input
+    for i in range(len(onnx_model.graph.input)):
+        if onnx_model.graph.input[i].name in input_nodes:
+            to_rename = onnx_model.graph.input[i].name
+            idx = input_nodes.index(to_rename)
+            onnx_model.graph.input[i].name = remap_inputs[idx]
+    # change names of all nodes in the graph
+    for i in range(len(onnx_model.graph.node)):
+        # check node.input is to change to remap_inputs or remap_outputs
+        for j in range(len(onnx_model.graph.node[i].input)):
+            if onnx_model.graph.node[i].input[j] in input_nodes:
+                to_rename = onnx_model.graph.node[i].input[j]
+                idx = input_nodes.index(to_rename)
+                onnx_model.graph.node[i].input[j] = remap_inputs[idx]
+            if onnx_model.graph.node[i].input[j] in output_nodes:
+                to_rename = onnx_model.graph.node[i].input[j]
+                idx = output_nodes.index(to_rename)
+                onnx_model.graph.node[i].input[j] = remap_outputs[idx]
+        # check node.output is to change to remap_inputs or remap_outputs
+        for j in range(len(onnx_model.graph.node[i].output)):
+            if onnx_model.graph.node[i].output[j] in output_nodes:
+                to_rename = onnx_model.graph.node[i].output[j]
+                idx = output_nodes.index(to_rename)
+                onnx_model.graph.node[i].output[j] = remap_outputs[idx]
+            if onnx_model.graph.node[i].output[j] in input_nodes:
+                to_rename = onnx_model.graph.node[i].output[j]
+                idx = input_nodes.index(to_rename)
+                onnx_model.graph.node[i].output[j] = remap_inputs[idx]
+    # change names for graph output
+    for i in range(len(onnx_model.graph.output)):
+        if onnx_model.graph.output[i].name in output_nodes:
+            to_rename = onnx_model.graph.output[i].name
+            idx = output_nodes.index(to_rename)
+            onnx_model.graph.output[i].name = remap_outputs[idx]
+
+
 def _convert(args):
     _apply_verbosity(args.verbose)
 
@@ -142,6 +212,13 @@ def _convert(args):
             options.unroll_rnn = _utils._is_valid_attr(args, 'unroll_rnn')
             options.unroll_lstm = _utils._is_valid_attr(args, 'unroll_lstm')
             onnx_legalizer.legalize(onnx_model, options)
+        if _utils._is_valid_attr(args, 'keep_io_order'):
+            _remap_io_names(onnx_model)
+            if _utils._is_valid_attr(args, 'save_intermediate'):
+                basename = os.path.basename(getattr(args, 'input_path'))
+                fixed_path = os.path.join(tmpdir,
+                                          os.path.splitext(basename)[0] + '~.onnx')
+                onnx.save(onnx_model, fixed_path)
         tf_savedmodel = onnx_tf.backend.prepare(onnx_model)
 
         savedmodel_name = os.path.splitext(os.path.basename(
@@ -166,8 +243,8 @@ def _convert(args):
         # make a command to convert from tflite to circle
         tflite2circle_path = os.path.join(dir_path, 'tflite2circle')
         tflite2circle_cmd = _make_cmd.make_tflite2circle_cmd(tflite2circle_path,
-                                                           tf2tfliteV2_output_path,
-                                                           getattr(args, 'output_path'))
+                                                             tf2tfliteV2_output_path,
+                                                             getattr(args, 'output_path'))
 
         f.write((' '.join(tflite2circle_cmd) + '\n').encode())
 
diff --git a/compiler/one-cmds/one-import-pytorch b/compiler/one-cmds/one-import-pytorch
index dbf1ba6d7..7f39e61bb 100644
--- a/compiler/one-cmds/one-import-pytorch
+++ b/compiler/one-cmds/one-import-pytorch
@@ -80,7 +80,8 @@ def _get_parser():
     tf2tflite_group.add_argument('--converter_version', default='v2')
 
     parser.add_argument('--unroll_rnn', action='store_true', help='Unroll RNN operators')
-    parser.add_argument('--unroll_lstm', action='store_true', help='Unroll LSTM operators')
+    parser.add_argument(
+        '--unroll_lstm', action='store_true', help='Unroll LSTM operators')
 
     # save intermediate file(s)
     parser.add_argument(
@@ -338,8 +339,8 @@ def _convert(args):
         # make a command to convert from tflite to circle
         tflite2circle_path = os.path.join(dir_path, 'tflite2circle')
         tflite2circle_cmd = _make_cmd.make_tflite2circle_cmd(tflite2circle_path,
-                                                           tf2tfliteV2_output_path,
-                                                           getattr(args, 'output_path'))
+                                                             tf2tfliteV2_output_path,
+                                                             getattr(args, 'output_path'))
 
         f.write((' '.join(tflite2circle_cmd) + '\n').encode())
 
diff --git a/compiler/one-cmds/one-import-tf b/compiler/one-cmds/one-import-tf
index 999255a34..6623fa6a4 100644
--- a/compiler/one-cmds/one-import-tf
+++ b/compiler/one-cmds/one-import-tf
@@ -21,8 +21,6 @@
 
 import argparse
 import os
-import subprocess
-import sys
 import tempfile
 
 import onelib.make_cmd as _make_cmd
@@ -152,8 +150,8 @@ def _convert(args):
             tmpdir,
             os.path.splitext(os.path.basename(args.output_path))[0]) + '.tflite'
         tf2tfliteV2_cmd = _make_cmd.make_tf2tfliteV2_cmd(args, tf2tfliteV2_path,
-                                                       getattr(args, 'input_path'),
-                                                       tf2tfliteV2_output_path)
+                                                         getattr(args, 'input_path'),
+                                                         tf2tfliteV2_output_path)
 
         f.write((' '.join(tf2tfliteV2_cmd) + '\n').encode())
 
@@ -163,8 +161,8 @@ def _convert(args):
         # make a command to convert from tflite to circle
         tflite2circle_path = os.path.join(dir_path, 'tflite2circle')
         tflite2circle_cmd = _make_cmd.make_tflite2circle_cmd(tflite2circle_path,
-                                                           tf2tfliteV2_output_path,
-                                                           getattr(args, 'output_path'))
+                                                             tf2tfliteV2_output_path,
+                                                             getattr(args, 'output_path'))
 
         f.write((' '.join(tflite2circle_cmd) + '\n').encode())
 
diff --git a/compiler/one-cmds/one-import-tflite b/compiler/one-cmds/one-import-tflite
index 2d756bff6..3d96b117f 100644
--- a/compiler/one-cmds/one-import-tflite
+++ b/compiler/one-cmds/one-import-tflite
@@ -21,7 +21,6 @@
 
 import argparse
 import os
-import subprocess
 import sys
 
 import onelib.make_cmd as _make_cmd
@@ -83,8 +82,8 @@ def _convert(args):
         # make a command to convert from tflite to circle
         tflite2circle_path = os.path.join(dir_path, 'tflite2circle')
         tflite2circle_cmd = _make_cmd.make_tflite2circle_cmd(tflite2circle_path,
-                                                           getattr(args, 'input_path'),
-                                                           getattr(args, 'output_path'))
+                                                             getattr(args, 'input_path'),
+                                                             getattr(args, 'output_path'))
 
         f.write((' '.join(tflite2circle_cmd) + '\n').encode())
 
diff --git a/compiler/one-cmds/one-infer b/compiler/one-cmds/one-infer
new file mode 100644
index 000000000..c7fcd8afd
--- /dev/null
+++ b/compiler/one-cmds/one-infer
@@ -0,0 +1,224 @@
+#!/usr/bin/env bash
+''''export SCRIPT_PATH="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)" # '''
+''''export PY_PATH=${SCRIPT_PATH}/venv/bin/python                                       # '''
+''''test -f ${PY_PATH} && exec ${PY_PATH} "$0" "$@"                                     # '''
+''''echo "Error: Virtual environment not found. Please run 'one-prepare-venv' command." # '''
+''''exit 255                                                                            # '''
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import copy
+import glob
+import itertools
+import ntpath
+import os
+import sys
+
+import utils as _utils
+
+# TODO Find better way to suppress traceback on error
+sys.tracebacklimit = 0
+
+
+def _get_backends_list():
+    """
+    [one hierarchy]
+    one
+    ├── backends
+    ├── bin
+    ├── doc
+    ├── include
+    ├── lib
+    ├── optimization
+    └── test
+
+    The list where `one-infer` finds its backends
+    - `bin` folder where `one-infer` exists
+    - `backends` folder
+
+    NOTE If there are backends of the same name in different places,
+     the closer to the top in the list, the higher the priority.
+    """
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+    backend_set = set()
+
+    # bin folder
+    files = [f for f in glob.glob(dir_path + '/*-infer')]
+    # backends folder
+    files += [f for f in glob.glob(dir_path + '/../backends/**/*-infer', recursive=True)]
+    # TODO find backends in `$PATH`
+
+    backends_list = []
+    for cand in files:
+        base = ntpath.basename(cand)
+        if (not base in backend_set) and os.path.isfile(cand) and os.access(
+                cand, os.X_OK):
+            backend_set.add(base)
+            backends_list.append(cand)
+
+    return backends_list
+
+
+def _search_backend_driver(driver):
+    """
+    [one hierarchy]
+    one
+    ├── backends
+    ├── bin
+    ├── doc
+    ├── include
+    ├── lib
+    ├── optimization
+    └── test
+
+    The list where `one-infer` finds its backend driver
+    - `bin` folder where `one-infer` exists
+    - `backends/**/bin/` folder
+
+    NOTE If there are drivers of the same name in different places,
+     the closer to the top in the list, the higher the priority.
+    """
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+
+    # CASE 1: one/bin/{driver} is found
+    driver_path = dir_path + '/' + driver
+    if os.path.isfile(driver_path) and os.access(driver_path, os.X_OK):
+        return driver_path
+
+    # CASE 2: one/backends/**/bin/{driver} is found
+    for driver_path in glob.glob(
+            dir_path + '/../backends/**/bin/' + driver, recursive=True):
+        if os.path.isfile(driver_path) and os.access(driver_path, os.X_OK):
+            return driver_path
+
+    # CASE 3: {driver} is found in nowhere
+    return None
+
+
+def _get_parser(backends_list):
+    infer_usage = 'one-infer [-h] [-v] [-C CONFIG] [-d DRIVER | -b BACKEND] [--post-process POST_PROCESS] [--] [COMMANDS FOR BACKEND DRIVER]'
+    parser = argparse.ArgumentParser(
+        description='command line tool to infer model', usage=infer_usage)
+
+    _utils._add_default_arg(parser)
+
+    # TODO: add tflite/onnx-infer driver to helper message when it is implemented
+    driver_help_message = 'backend inference driver name to execute'
+    parser.add_argument('-d', '--driver', type=str, help=driver_help_message)
+
+    # get backend list in the directory
+    backends_name = [ntpath.basename(f) for f in backends_list]
+    if not backends_name:
+        backends_name_message = '(There is no available backend drivers)'
+    else:
+        backends_name_message = '(available backend drivers: ' + ', '.join(
+            backends_name) + ')'
+    backend_help_message = 'backend name to use ' + backends_name_message
+    parser.add_argument('-b', '--backend', type=str, help=backend_help_message)
+
+    post_process_help_message = 'post processing script to convert I/O data to standard format'
+    parser.add_argument('--post-process', type=str, help=post_process_help_message)
+
+    return parser
+
+
+def _verify_arg(parser, args):
+    """verify given arguments"""
+    # `-d/--driver` and `-b/--backend` are mutually exclusive arguments.
+    if _utils._is_valid_attr(args, 'driver') and _utils._is_valid_attr(args, 'backend'):
+        parser.error(
+            '-d and -b options are mutually exclusive. Please use only one of them')
+
+    missing = []
+    if not _utils._is_valid_attr(args, 'driver') and not _utils._is_valid_attr(
+            args, 'backend'):
+        missing.append('{-d/--driver | -b/--backend}')
+    if len(missing):
+        parser.error('the following arguments are required: ' + ' '.join(missing))
+
+
+def _parse_arg(parser):
+    infer_args = []
+    backend_args = []
+    argv = copy.deepcopy(sys.argv)
+    # delete file name
+    del argv[0]
+    # split by '--'
+    args = [list(y) for x, y in itertools.groupby(argv, lambda z: z == '--') if not x]
+
+    # one-infer [-h] [-v] [-C CONFIG] [-d DRIVER] [-b BACKEND] [--post-process POST_PROCESS] -- [COMMANDS FOR BACKEND DRIVER]
+    if len(args):
+        infer_args = args[0]
+        infer_args = parser.parse_args(infer_args)
+        backend_args = backend_args if len(args) < 2 else args[1]
+    # print version
+    if len(args) and infer_args.version:
+        _utils._print_version_and_exit(__file__)
+
+    return infer_args, backend_args
+
+
+def _get_executable(args, backends_list):
+    driver = _utils._is_valid_attr(args, 'driver')
+    if driver:
+        executable = _search_backend_driver(driver)
+        if executable:
+            return executable
+        else:
+            raise FileNotFoundError(driver + ' not found')
+
+    if _utils._is_valid_attr(args, 'backend'):
+        backend_base = getattr(args, 'backend') + '-infer'
+        for cand in backends_list:
+            if ntpath.basename(cand) == backend_base:
+                return cand
+        raise FileNotFoundError(backend_base + ' not found')
+
+
+def main():
+    """Run the chosen backend infer driver, then the optional post-process script."""
+    backends_list = _get_backends_list()
+
+    # parse arguments ('--' separates one-infer options from backend driver options)
+    parser = _get_parser(backends_list)
+    args, backend_args = _parse_arg(parser)
+
+    # parse configuration file
+    _utils._parse_cfg(args, 'one-infer')
+
+    # verify arguments
+    _verify_arg(parser, args)
+
+    # make a command to run given backend driver
+    driver_path = _get_executable(args, backends_list)
+    infer_cmd = [driver_path] + backend_args
+    if _utils._is_valid_attr(args, 'command'):
+        infer_cmd += getattr(args, 'command').split()
+
+    # run backend driver
+    _utils._run(infer_cmd, err_prefix=ntpath.basename(driver_path))
+
+    # run post process script if it's given
+    if _utils._is_valid_attr(args, 'post_process'):
+        # NOTE: the given python script will be executed by venv of ONE
+        python_path = sys.executable
+        # split on whitespace runs, like 'command' above, so no empty arguments are made
+        post_process_command = [python_path] + getattr(args, 'post_process').split()
+        _utils._run(post_process_command, err_prefix='one-infer')
+
+
+if __name__ == '__main__':
+    _utils._safemain(main, __file__)
diff --git a/compiler/one-cmds/one-init b/compiler/one-cmds/one-init
new file mode 100644
index 000000000..04c4534cd
--- /dev/null
+++ b/compiler/one-cmds/one-init
@@ -0,0 +1,280 @@
+#!/usr/bin/env bash
+''''export SCRIPT_PATH="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)" # '''
+''''export PY_PATH=${SCRIPT_PATH}/venv/bin/python                                       # '''
+''''test -f ${PY_PATH} && exec ${PY_PATH} "$0" "$@"                                     # '''
+''''echo "Error: Virtual environment not found. Please run 'one-prepare-venv' command." # '''
+''''exit 255                                                                            # '''
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import copy
+import glob
+import itertools
+import ntpath
+import os
+import sys
+
+import configparser
+import utils as _utils
+
+# TODO Find better way to suppress traceback on error
+sys.tracebacklimit = 0
+
+
+class CommentableConfigParser(configparser.ConfigParser):
+    """
+    ConfigParser where comment can be stored
+    In Python ConfigParser, comment in ini file ( starting with ';') is considered a key of which
+    value is None.
+    Ref: https://stackoverflow.com/questions/6620637/writing-comments-to-files-with-configparser
+    """
+
+    def __init__(self):
+        # allow_no_value=True to add comment
+        # ref: https://stackoverflow.com/a/19432072
+        configparser.ConfigParser.__init__(self, allow_no_value=True)
+        self.optionxform = str
+
+    def add_comment(self, section, comment):
+        comment_sign = ';'
+        self[section][f'{comment_sign} {comment}'] = None
+
+
+def _get_backends_list():
+    """
+    [one hierarchy]
+    one
+    ├── backends
+    ├── bin
+    ├── doc
+    ├── include
+    ├── lib
+    ├── optimization
+    └── test
+
+    The list where `one-init` finds its backends
+    - `bin` folder where `one-init` exists
+    - `backends` folder
+
+    NOTE If there are backends of the same name in different places,
+     the closer to the top in the list, the higher the priority.
+    """
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+    backend_set = set()
+
+    # bin folder
+    files = [f for f in glob.glob(dir_path + '/*-init')]
+    # backends folder
+    files += [f for f in glob.glob(dir_path + '/../backends/**/*-init', recursive=True)]
+    # TODO find backends in `$PATH`
+
+    backends_list = []
+    for cand in files:
+        base = ntpath.basename(cand)
+        if (not base in backend_set) and os.path.isfile(cand) and os.access(
+                cand, os.X_OK):
+            backend_set.add(base)
+            backends_list.append(cand)
+
+    return backends_list
+
+
+# TODO Add support for TF graphdef and bcq
+def _get_parser(backends_list):
+    init_usage = (
+        'one-init [-h] [-v] [-V] '
+        '[-i INPUT_PATH] '
+        '[-o OUTPUT_PATH] '
+        '[-m MODEL_TYPE] '
+        '[-b BACKEND] '
+        # args for onnx model
+        '[--convert_nchw_to_nhwc] '
+        '[--nchw_to_nhwc_input_shape] '
+        '[--nchw_to_nhwc_output_shape] '
+        # args for backend driver
+        '[--] [COMMANDS FOR BACKEND DRIVER]')
+    """
+    NOTE
+    Layout options for onnx models could be difficult for users.
+    In one-init, we could consider easier args for the above three:
+    For example, we could have another option, e.g., --input_img_layout LAYOUT
+      - When LAYOUT is NHWC, apply 'nchw_to_nhwc_input_shape=True' into cfg
+      - When LAYOUT is NCHW, apply 'nchw_to_nhwc_input_shape=False' into cfg
+    """
+
+    parser = argparse.ArgumentParser(
+        description='Command line tool to generate initial cfg file. '
+        'Currently tflite and onnx models are supported',
+        usage=init_usage)
+
+    _utils._add_default_arg_no_CS(parser)
+
+    parser.add_argument(
+        '-i', '--input_path', type=str, help='full filepath of the input model file')
+    parser.add_argument(
+        '-o', '--output_path', type=str, help='full filepath of the output cfg file')
+    parser.add_argument(
+        '-m',
+        '--model_type',
+        type=str,
+        help=('type of input model: "onnx", "tflite". '
+              'If the file extension passed to --input_path is '
+              '".tflite" or ".onnx", this arg can be omitted.'))
+
+    onnx_group = parser.add_argument_group('arguments when model type is onnx')
+    onnx_group.add_argument(
+        '--convert_nchw_to_nhwc',
+        action='store_true',
+        help=
+        'Convert NCHW operators to NHWC under the assumption that input model is NCHW.')
+    onnx_group.add_argument(
+        '--nchw_to_nhwc_input_shape',
+        action='store_true',
+        help='Convert the input shape of the model (argument for convert_nchw_to_nhwc)')
+    onnx_group.add_argument(
+        '--nchw_to_nhwc_output_shape',
+        action='store_true',
+        help='Convert the output shape of the model (argument for convert_nchw_to_nhwc)')
+
+    # get backend list in the directory
+    backends_name = [ntpath.basename(f) for f in backends_list]
+    if not backends_name:
+        backends_name_message = '(There is no available backend drivers)'
+    else:
+        backends_name_message = '(available backend drivers: ' + ', '.join(
+            backends_name) + ')'
+    backend_help_message = 'backend name to use ' + backends_name_message
+    parser.add_argument('-b', '--backend', type=str, help=backend_help_message)
+
+    return parser
+
+
+def _verify_arg(parser, args):
+    # check if required arguments are given
+    missing = []
+    if not _utils._is_valid_attr(args, 'input_path'):
+        missing.append('-i/--input_path')
+    if not _utils._is_valid_attr(args, 'output_path'):
+        missing.append('-o/--output_path')
+    if not _utils._is_valid_attr(args, 'backend'):
+        missing.append('-b/--backend')
+
+    if _utils._is_valid_attr(args, 'model_type'):
+        # TODO Support model types other than onnx and tflite (e.g., TF)
+        if getattr(args, 'model_type') not in ['onnx', 'tflite']:
+            parser.error('Allowed value for --model_type: "onnx" or "tflite"')
+
+    if _utils._is_valid_attr(args, 'nchw_to_nhwc_input_shape'):
+        if not _utils._is_valid_attr(args, 'convert_nchw_to_nhwc'):
+            missing.append('--convert_nchw_to_nhwc')
+    if _utils._is_valid_attr(args, 'nchw_to_nhwc_output_shape'):
+        if not _utils._is_valid_attr(args, 'convert_nchw_to_nhwc'):
+            missing.append('--convert_nchw_to_nhwc')
+
+    if len(missing):
+        parser.error('the following arguments are required: ' + ' '.join(missing))
+
+
+def _parse_arg(parser):
+    init_args = []
+    backend_args = []
+    argv = copy.deepcopy(sys.argv)
+    # delete file name
+    del argv[0]
+    # split by '--'
+    args = [list(y) for x, y in itertools.groupby(argv, lambda z: z == '--') if not x]
+
+    # one-init [-h] [-v] ...
+    if len(args):
+        init_args = args[0]
+        init_args = parser.parse_args(init_args)
+        backend_args = backend_args if len(args) < 2 else args[1]
+    # print version
+    if len(args) and init_args.version:
+        _utils._print_version_and_exit(__file__)
+
+    return init_args, backend_args
+
+
+def _get_executable(args, backends_list):
+    if _utils._is_valid_attr(args, 'backend'):
+        backend_base = getattr(args, 'backend') + '-init'
+        for cand in backends_list:
+            if ntpath.basename(cand) == backend_base:
+                return cand
+        raise FileNotFoundError(backend_base + ' not found')
+
+
+# TODO Support workflow format (https://github.com/Samsung/ONE/pull/9354)
+def _generate(args):
+    """Generate the initial cfg and write it to args.output_path (sections are NYI)."""
+    config = CommentableConfigParser()
+
+    def _add_onecc_sections():
+        pass  # NYI
+
+    def _gen_import():
+        pass  # NYI
+
+    def _gen_optimize():
+        pass  # NYI
+
+    def _gen_quantize():
+        pass  # NYI
+
+    def _gen_codegen():
+        pass  # NYI
+
+    #
+    # NYI: one-profile, one-partition, one-pack, one-infer
+    #
+
+    _add_onecc_sections()
+
+    _gen_import()
+    _gen_optimize()
+    _gen_quantize()
+    _gen_codegen()
+
+    with open(args.output_path, 'w') as f:  # args is local to main(), so it is passed in
+        config.write(f)
+
+
+def main():
+    # get backend list
+    backends_list = _get_backends_list()
+
+    # parse arguments
+    parser = _get_parser(backends_list)
+    args, backend_args = _parse_arg(parser)
+
+    # verify arguments
+    _verify_arg(parser, args)
+
+    # make a command to run given backend driver
+    driver_path = _get_executable(args, backends_list)
+    init_cmd = [driver_path] + backend_args
+
+    # run backend driver
+    _utils._run(init_cmd, err_prefix=ntpath.basename(driver_path))
+
+    #TODO generate cfg file
+
+    raise NotImplementedError("NYI")
+
+
+if __name__ == '__main__':
+    _utils._safemain(main, __file__)
diff --git a/compiler/one-cmds/one-optimize b/compiler/one-cmds/one-optimize
index 8b1f3f7be..481fc8459 100644
--- a/compiler/one-cmds/one-optimize
+++ b/compiler/one-cmds/one-optimize
@@ -21,7 +21,6 @@
 
 import argparse
 import os
-import subprocess
 import sys
 
 import onelib.constant as _constant
@@ -83,6 +82,14 @@ def _verify_arg(parser, args):
     if len(missing):
         parser.error('the following arguments are required: ' + ' '.join(missing))
 
+    # default has pre-defined optimization options
+    default = _get_parser().parse_args()
+
+    # check if unrecognized arguments are given
+    diff = set(dir(args)) - set(dir(default))
+    if len(diff):
+        parser.error('the following arguments are unrecognized: ' + ' '.join(diff))
+
 
 def _parse_arg(parser):
     args = parser.parse_args()
@@ -102,8 +109,8 @@ def _optimize(args):
         # make a command to optimize circle model
         circle2circle_path = os.path.join(dir_path, 'circle2circle')
         circle2circle_cmd = _make_cmd.make_circle2circle_cmd(args, circle2circle_path,
-                                                           getattr(args, 'input_path'),
-                                                           getattr(args, 'output_path'))
+                                                             getattr(args, 'input_path'),
+                                                             getattr(args, 'output_path'))
 
         # verbose
         if _utils._is_valid_attr(args, 'verbose'):
diff --git a/compiler/one-cmds/one-pack b/compiler/one-cmds/one-pack
index 133207de0..5cab7c737 100644
--- a/compiler/one-cmds/one-pack
+++ b/compiler/one-cmds/one-pack
@@ -21,9 +21,7 @@
 
 import argparse
 import os
-import subprocess
 import sys
-import tempfile
 
 import utils as _utils
 
diff --git a/compiler/one-cmds/one-partition b/compiler/one-cmds/one-partition
new file mode 100644
index 000000000..c0d71e5d9
--- /dev/null
+++ b/compiler/one-cmds/one-partition
@@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+''''export SCRIPT_PATH="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)" # '''
+''''export PY_PATH=${SCRIPT_PATH}/venv/bin/python                                       # '''
+''''test -f ${PY_PATH} && exec ${PY_PATH} "$0" "$@"                                     # '''
+''''echo "Error: Virtual environment not found. Please run 'one-prepare-venv' command." # '''
+''''exit 255                                                                            # '''
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import configparser
+import os
+import sys
+
+import utils as _utils
+
+# TODO Find better way to suppress traceback on error
+sys.tracebacklimit = 0
+
+
+def _get_parser():
+    parser = argparse.ArgumentParser(
+        description='command line tool to partition circle model by multiple backends')
+
+    _utils._add_default_arg(parser)
+
+    parser.add_argument(
+        '--backends', type=str, help='backends in CSV to use for partitioning')
+    parser.add_argument('--default', type=str, help='default backend to assign')
+
+    parser.add_argument(
+        '--part_file', type=str, help='partition file which provides backend to assign')
+    parser.add_argument('--input_file', type=str, help='input circle model filename')
+    parser.add_argument(
+        '--work_path',
+        type=str,
+        help='work path of partition, input files exist and output files are produced')
+
+    return parser
+
+
+def _parse_arg(parser):
+    args = parser.parse_args()
+    # print version
+    if args.version:
+        _utils._print_version_and_exit(__file__)
+
+    return args
+
+
+def _verify_arg(parser, args):
+    """verify given arguments"""
+    # check if required arguments are given
+    missing = []
+    if not _utils._is_valid_attr(args, 'part_file'):
+        missing.append('part_file')
+    if not _utils._is_valid_attr(args, 'input_file'):
+        missing.append('input_file')
+    if len(missing):
+        parser.error('the following arguments are required: ' + ' '.join(missing))
+    return
+
+
+def _partition(args):
+    # log file path: part_file joined onto the current directory, plus '.log'
+    bin_path = os.path.dirname(os.path.realpath(__file__))
+    cur_path = os.getcwd()
+    partition_path = os.path.join(cur_path, args.part_file)
+    logfile_path = partition_path + '.log'
+
+    with open(logfile_path, 'wb', buffering=0) as f:
+        # make a command to run circle-partitioner located next to this script
+        circle_partitioner_path = os.path.join(bin_path, 'circle-partitioner')
+
+        cmd = [os.path.expanduser(circle_partitioner_path)]
+
+        if _utils._is_valid_attr(args, 'backends'):
+            cmd.append('--backends')
+            cmd.append(getattr(args, 'backends'))
+        if _utils._is_valid_attr(args, 'default'):
+            cmd.append('--default')
+            cmd.append(getattr(args, 'default'))
+        if _utils._is_valid_attr(args, 'work_path'):
+            cmd.append('--work_path')
+            cmd.append(getattr(args, 'work_path'))
+
+        cmd.append('--part_file')
+        cmd.append(args.part_file)
+        cmd.append('--input_file')
+        cmd.append(args.input_file)
+
+        f.write((' '.join(cmd) + '\n').encode())  # record the command line in the log
+
+        # run circle-partitioner
+        _utils._run(cmd, err_prefix='circle-partitioner', logfile=f)
+
+
+def main():
+    # parse arguments
+    parser = _get_parser()
+    args = _parse_arg(parser)
+
+    # parse configuration file
+    _utils._parse_cfg(args, 'one-partition')
+
+    if _utils._is_valid_attr(args, 'config'):
+        config_path = getattr(args, 'config')
+        _utils._parse_cfg_and_overwrite(config_path, 'one-partition', args)
+
+    # verify arguments
+    _verify_arg(parser, args)
+
+    # do partition
+    _partition(args)
+
+
+if __name__ == '__main__':
+    _utils._safemain(main, __file__)
diff --git a/compiler/one-cmds/one-prepare-venv b/compiler/one-cmds/one-prepare-venv
index 0f75166a7..b435671f4 100644
--- a/compiler/one-cmds/one-prepare-venv
+++ b/compiler/one-cmds/one-prepare-venv
@@ -41,6 +41,7 @@ VER_ONNX_TF=1.10.0
 # Install tensorflow
 
 PIP_TRUSTED_HOST="--trusted-host pypi.org "
+PIP_TRUSTED_HOST+="--trusted-host pypi.python.org "
 PIP_TRUSTED_HOST+="--trusted-host files.pythonhost.org "
 PIP_TRUSTED_HOST+="--trusted-host download.pytorch.org "
 
@@ -62,7 +63,8 @@ else
   ${VENV_PYTHON} -m pip ${PIP_OPTIONS} install tensorflow-cpu==${VER_TENSORFLOW}
 fi
 ${VENV_PYTHON} -m pip ${PIP_OPTIONS} install Pillow
-${VENV_PYTHON} -m pip ${PIP_OPTIONS} install tensorflow_probability
+# TODO remove version fix, https://github.com/Samsung/ONE/issues/9240
+${VENV_PYTHON} -m pip ${PIP_OPTIONS} install tensorflow_probability==0.16.0
 
 # Install PyTorch and ONNX related
 # NOTE set ONE_PREPVENV_TORCH_STABLE to override 'torch_stable.html' URL.
@@ -72,6 +74,8 @@ TORCH_STABLE_URL="https://download.pytorch.org/whl/torch_stable.html"
 if [[ ! -z "$ONE_PREPVENV_TORCH_STABLE" ]]; then
   TORCH_STABLE_URL="${ONE_PREPVENV_TORCH_STABLE}"
 fi
+# TODO remove torch message
+echo "Torch from '${ONE_PREPVENV_TORCH_STABLE}' -> '${TORCH_STABLE_URL}'"
 ${VENV_PYTHON} -m pip ${PIP_OPTIONS} install torch==1.11.0+cpu -f ${TORCH_STABLE_URL}
 
 ${VENV_PYTHON} -m pip ${PIP_OPTIONS} install onnx==${VER_ONNX}
@@ -84,3 +88,7 @@ if [ -n "${EXT_ONNX_TF_WHL}" ]; then
 else
   ${VENV_PYTHON} -m pip ${PIP_OPTIONS} install onnx-tf==${VER_ONNX_TF}
 fi
+
+# NOTE refer https://github.com/protocolbuffers/protobuf/issues/10051
+# TODO remove this when issue is resolved
+${VENV_PYTHON} -m pip ${PIP_OPTIONS} install --upgrade protobuf==3.20.1
diff --git a/compiler/one-cmds/one-profile b/compiler/one-cmds/one-profile
index ed6d8bd7a..b19c215ed 100644
--- a/compiler/one-cmds/one-profile
+++ b/compiler/one-cmds/one-profile
@@ -25,9 +25,7 @@ import glob
 import itertools
 import ntpath
 import os
-import subprocess
 import sys
-import tempfile
 
 import utils as _utils
 
diff --git a/compiler/one-cmds/one-quantize b/compiler/one-cmds/one-quantize
index f2eff24bd..9282007d8 100644
--- a/compiler/one-cmds/one-quantize
+++ b/compiler/one-cmds/one-quantize
@@ -21,11 +21,12 @@
 
 import argparse
 import os
-import subprocess
 import sys
 import tempfile
+import json
 
 import utils as _utils
+from utils import Command
 
 # TODO Find better way to suppress trackback on error
 sys.tracebacklimit = 0
@@ -67,6 +68,12 @@ def _get_parser():
         action='store_true',
         help='generate profiling data')
 
+    # save intermediate file(s)
+    parser.add_argument(
+        '--save_intermediate',
+        action='store_true',
+        help='Save intermediate files to output folder')
+
     ## arguments for quantization
     quantization_group = parser.add_argument_group('arguments for quantization')
 
@@ -93,13 +100,13 @@ def _get_parser():
         '--input_type',
         type=str,
         help=
-        'data type of inputs of quantized model (supported: uint8, int16, default=quantized_dtype). QUANTIZE Op will be inserted at the beginning of the quantized model if input_type is different from quantized_dtype.'
+        'data type of inputs of quantized model (supported: uint8, int16, float32, default=quantized_dtype). QUANTIZE Op will be inserted at the beginning of the quantized model if input_type is different from quantized_dtype.'
     )
     quantization_group.add_argument(
         '--output_type',
         type=str,
         help=
-        'data type of outputs of quantized model (supported: uint8, int16, default=quantized_dtype). QUANTIZE Op will be inserted at the end of the quantized model if output_type is different from quantized_dtype.'
+        'data type of outputs of quantized model (supported: uint8, int16, float32, default=quantized_dtype). QUANTIZE Op will be inserted at the end of the quantized model if output_type is different from quantized_dtype.'
     )
     quantization_group.add_argument(
         '--min_percentile',
@@ -126,10 +133,50 @@ def _get_parser():
         "Force MaxPool Op to have the same input/output quantparams. NOTE: This option can degrade accuracy of some models.)"
     )
     quantization_group.add_argument(
-        '--quant_config',
-        type=str,
+        '--quant_config', type=str, help="Path to the quantization configuration file.")
+    quantization_group.add_argument(
+        '--evaluate_result',
+        action='store_true',
+        help=
+        "Evaluate accuracy of quantized model. Run inference for both fp32 model and the quantized model, and compare the inference results."
+    )
+    quantization_group.add_argument(
+        '--test_data', type=str, help="Path to the test data used for evaluation.")
+    quantization_group.add_argument(
+        '--print_mae',
+        action='store_true',
+        help=
+        "Print MAE (Mean Absolute Error) of inference results between quantized model and fp32 model."
+    )
+    quantization_group.add_argument(
+        '--print_mape',
+        action='store_true',
+        help=
+        "Print MAPE (Mean Absolute Percentage Error) of inference results between quantized model and fp32 model."
+    )
+    quantization_group.add_argument(
+        '--print_mpeir',
+        action='store_true',
+        help=
+        "Print MPEIR (Mean Peak Error to Interval Ratio) of inference results between quantized model and fp32 model."
+    )
+    quantization_group.add_argument(
+        '--print_top1_match',
+        action='store_true',
+        help=
+        "Print Top-1 match ratio of inference results between quantized model and fp32 model."
+    )
+    quantization_group.add_argument(
+        '--print_top5_match',
+        action='store_true',
+        help=
+        "Print Top-5 match ratio of inference results between quantized model and fp32 model."
+    )
+    quantization_group.add_argument(
+        '--print_mse',
+        action='store_true',
         help=
-        "Path to the quantization configuration file."
+        "Print MSE (Mean Squared Error) of inference results between quantized model and fp32 model."
     )
 
     # arguments for force_quantparam option
@@ -162,6 +209,14 @@ def _get_parser():
     copy_quantparam_group.add_argument(
         '--dst_tensor_name', type=str, action='append', help='tensor name (string)')
 
+    # arguments for fake_quant option
+    fake_quant_group = parser.add_argument_group('arguments for fake_quantize option')
+
+    fake_quant_group.add_argument(
+        '--fake_quantize',
+        action='store_true',
+        help='convert quantized model to fake-quantized fp32 model.')
+
     return parser
 
 
@@ -171,8 +226,29 @@ def _set_default_values(args):
         setattr(args, 'input_model_dtype', 'float32')
     if not _utils._is_valid_attr(args, 'quantized_dtype'):
         setattr(args, 'quantized_dtype', 'uint8')
+        if _utils._is_valid_attr(args, 'quant_config'):
+            # Get quantized_dtype from qconfig file
+            try:
+                with open(getattr(args, 'quant_config')) as f:
+                    qconf = json.load(f)
+                    if 'default_quantization_dtype' in qconf:
+                        setattr(args, 'quantized_dtype',
+                                qconf['default_quantization_dtype'])
+            except json.decoder.JSONDecodeError:
+                print('Failed to decode ' + getattr(args, 'quant_config') +
+                      '. Please check it is a json file.')
     if not _utils._is_valid_attr(args, 'granularity'):
         setattr(args, 'granularity', 'layer')
+        if _utils._is_valid_attr(args, 'quant_config'):
+            # Get granularity from qconfig file
+            try:
+                with open(getattr(args, 'quant_config')) as f:
+                    qconf = json.load(f)
+                    if 'default_granularity' in qconf:
+                        setattr(args, 'granularity', qconf['default_granularity'])
+            except json.decoder.JSONDecodeError:
+                print('Failed to decode ' + getattr(args, 'quant_config') +
+                      '. Please check it is a json file.')
     if not _utils._is_valid_attr(args, 'mode'):
         setattr(args, 'mode', 'percentile')
     if not _utils._is_valid_attr(args, 'min_percentile'):
@@ -238,11 +314,18 @@ def _quantize(args):
         _copy_qparam(args)
         return
 
+    if _utils._is_valid_attr(args, 'fake_quantize'):
+        # fake-quantize model
+        _fake_quantize(args)
+        return
+
     # get file path to log
     dir_path = os.path.dirname(os.path.realpath(__file__))
     logfile_path = os.path.realpath(args.output_path) + '.log'
 
     with open(logfile_path, 'wb') as f, tempfile.TemporaryDirectory() as tmpdir:
+        if _utils._is_valid_attr(args, 'save_intermediate'):
+            tmpdir = os.path.dirname(logfile_path)
         # get driver path
         circle_quantizer_path = os.path.join(dir_path, 'circle-quantizer')
         record_minmax_path = os.path.join(dir_path, 'record-minmax')
@@ -263,13 +346,19 @@ def _quantize(args):
             circle_quantizer_cmd.append(getattr(args, 'quantized_dtype'))
         if _utils._is_valid_attr(args, 'granularity'):
             circle_quantizer_cmd.append(getattr(args, 'granularity'))
+        if _utils._is_valid_attr(args, 'quant_config'):
+            # NOTE --config conflicts with --config option in onecc, so
+            # we use quant_config for one-quantize
+            circle_quantizer_cmd.append('--config')
+            circle_quantizer_cmd.append(getattr(args, 'quant_config'))
         # input and output path
         if _utils._is_valid_attr(args, 'input_path'):
             circle_quantizer_cmd.append(getattr(args, 'input_path'))
-        tmp_output_path_1 = os.path.join(
+        tmp_weights_fake_quant_path = os.path.join(
             tmpdir,
-            os.path.splitext(os.path.basename(args.input_path))[0]) + '1.circle'
-        circle_quantizer_cmd.append(tmp_output_path_1)
+            os.path.splitext(os.path.basename(
+                args.input_path))[0]) + '.weights_fake_quant.circle'
+        circle_quantizer_cmd.append(tmp_weights_fake_quant_path)
         # profiling
         if _utils._is_valid_attr(args, 'generate_profile_data'):
             circle_quantizer_cmd.append('--generate_profile_data')
@@ -279,45 +368,23 @@ def _quantize(args):
         # run circle-quantizer
         _utils._run(circle_quantizer_cmd, err_prefix="circle_quantizer", logfile=f)
 
-        ## make a command to record min-max value of each tensor while running the representative dataset
-        circle_record_minmax_cmd = [record_minmax_path]
-        # verbose
-        if _utils._is_valid_attr(args, 'verbose'):
-            circle_record_minmax_cmd.append('--verbose')
-        # input and output path
-        circle_record_minmax_cmd.append('--input_model')
-        circle_record_minmax_cmd.append(tmp_output_path_1)
-        tmp_output_path_2 = os.path.join(
+        tmp_minmax_recorded_path = os.path.join(
             tmpdir,
-            os.path.splitext(os.path.basename(args.input_path))[0]) + '2.circle'
-        circle_record_minmax_cmd.append('--output_model')
-        circle_record_minmax_cmd.append(tmp_output_path_2)
-        # input data
-        if _utils._is_valid_attr(args, 'input_data'):
-            circle_record_minmax_cmd.append('--input_data')
-            circle_record_minmax_cmd.append(getattr(args, 'input_data'))
-        if _utils._is_valid_attr(args, 'input_data_format'):
-            circle_record_minmax_cmd.append('--input_data_format')
-            circle_record_minmax_cmd.append(getattr(args, 'input_data_format'))
-        # min and max percentile
-        if _utils._is_valid_attr(args, 'min_percentile'):
-            circle_record_minmax_cmd.append('--min_percentile')
-            circle_record_minmax_cmd.append(getattr(args, 'min_percentile'))
-        if _utils._is_valid_attr(args, 'max_percentile'):
-            circle_record_minmax_cmd.append('--max_percentile')
-            circle_record_minmax_cmd.append(getattr(args, 'max_percentile'))
-        # mode
-        if _utils._is_valid_attr(args, 'mode'):
-            circle_record_minmax_cmd.append('--mode')
-            circle_record_minmax_cmd.append(getattr(args, 'mode'))
-        # profiling
-        if _utils._is_valid_attr(args, 'generate_profile_data'):
-            circle_record_minmax_cmd.append('--generate_profile_data')
-
-        f.write((' '.join(circle_record_minmax_cmd) + '\n').encode())
+            os.path.splitext(os.path.basename(
+                args.input_path))[0]) + '.minmax_recorded.circle'
 
-        # run record-minmax
-        _utils._run(circle_record_minmax_cmd, err_prefix="record_minmax", logfile=f)
+        ## make a command to record min-max value of each tensor while running the representative dataset
+        record_minmax_cmd = Command(record_minmax_path, args, f)
+        record_minmax_cmd.add_noarg_option_if_valid_arg('--verbose', 'verbose') \
+            .add_option_with_values('--input_model', [tmp_weights_fake_quant_path]) \
+            .add_option_with_values('--output_model', [tmp_minmax_recorded_path]) \
+            .add_option_with_valid_args('--input_data', ['input_data']) \
+            .add_option_with_valid_args('--input_data_format', ['input_data_format']) \
+            .add_option_with_valid_args('--min_percentile', ['min_percentile']) \
+            .add_option_with_valid_args('--max_percentile', ['max_percentile']) \
+            .add_option_with_valid_args('--mode', ['mode']) \
+            .add_noarg_option_if_valid_arg('--generate_profile_data', 'generate_profile_data') \
+            .run()
 
         ## make a second command to quantize the model using the embedded information
         circle_quantizer_cmd = [circle_quantizer_path]
@@ -349,7 +416,7 @@ def _quantize(args):
             circle_quantizer_cmd.append('--config')
             circle_quantizer_cmd.append(getattr(args, 'quant_config'))
         # input and output path
-        circle_quantizer_cmd.append(tmp_output_path_2)
+        circle_quantizer_cmd.append(tmp_minmax_recorded_path)
         if _utils._is_valid_attr(args, 'output_path'):
             circle_quantizer_cmd.append(getattr(args, 'output_path'))
         # profiling
@@ -361,6 +428,38 @@ def _quantize(args):
         # run circle-quantizer
         _utils._run(circle_quantizer_cmd, err_prefix="circle_quantizer", logfile=f)
 
+        # evaluate
+        if _utils._is_valid_attr(args, 'evaluate_result'):
+            circle_eval_diff_path = os.path.join(dir_path, 'circle-eval-diff')
+            quant_model = ""
+            if _utils._is_valid_attr(args, 'output_path'):
+                quant_model = getattr(args, 'output_path')
+            tmp_fake_quant_model = os.path.join(
+                tmpdir,
+                os.path.splitext(os.path.basename(
+                    args.input_path))[0]) + '.fake_quant.circle'
+
+            # do fake quantization
+            fake_quantize_cmd = Command(circle_quantizer_path, args, f)
+            fake_quantize_cmd.add_noarg_option_if_valid_arg('--verbose', 'verbose') \
+                .add_option_with_values('--fake_quantize', [quant_model, tmp_fake_quant_model]) \
+                .run()
+
+            # compare fake-quant model and fp32 model
+            circle_eval_diff_cmd = Command(circle_eval_diff_path, args, f)
+            circle_eval_diff_cmd.add_option_with_valid_args('--first_model', ['input_path']) \
+                .add_option_with_values('--second_model', [tmp_fake_quant_model]) \
+                .add_option_with_valid_args('--first_input_data', ['test_data']) \
+                .add_option_with_valid_args('--second_input_data', ['test_data']) \
+                .add_option_with_valid_args('--input_data_format', ['input_data_format']) \
+                .add_noarg_option_if_valid_arg('--print_mae', 'print_mae') \
+                .add_noarg_option_if_valid_arg('--print_mape', 'print_mape') \
+                .add_noarg_option_if_valid_arg('--print_mpeir', 'print_mpeir') \
+                .add_noarg_option_if_valid_arg('--print_top1_match', 'print_top1_match') \
+                .add_noarg_option_if_valid_arg('--print_top5_match', 'print_top5_match') \
+                .add_noarg_option_if_valid_arg('--print_mse', 'print_mse') \
+                .run()
+
 
 def _write_qparam(args):
     # get file path to log
@@ -433,6 +532,24 @@ def _copy_qparam(args):
         _utils._run(circle_quantizer_cmd, err_prefix="circle_quantizer", logfile=f)
 
 
+def _fake_quantize(args):
+    """Run circle-quantizer with --fake_quantize on the given model.
+
+    The circle-quantizer driver is looked up in the directory of this script.
+    A log file named `<output_path>.log` is opened for binary writing and
+    passed to the Command together with `args`.
+    """
+    script_dir = os.path.dirname(os.path.realpath(__file__))
+    log_path = os.path.realpath(args.output_path) + '.log'
+
+    with open(log_path, 'wb') as log_file:
+        quantizer_path = os.path.join(script_dir, 'circle-quantizer')
+        # [quantized model to read, fake-quantized model to write]
+        models = [args.input_path, args.output_path]
+
+        cmd = Command(quantizer_path, args, log_file)
+        cmd = cmd.add_noarg_option_if_valid_arg('--verbose', 'verbose')
+        cmd = cmd.add_option_with_values('--fake_quantize', models)
+        cmd.run()
+
+
 def main():
     # parse arguments
     parser = _get_parser()
diff --git a/compiler/one-cmds/onecc b/compiler/one-cmds/onecc
index 25682ff4b..a5ba636a2 100644
--- a/compiler/one-cmds/onecc
+++ b/compiler/one-cmds/onecc
@@ -25,6 +25,8 @@ import os
 import subprocess
 import sys
 
+from onelib.CfgRunner import CfgRunner
+from onelib.WorkflowRunner import WorkflowRunner
 import utils as _utils
 
 # TODO Find better way to suppress trackback on error
@@ -42,6 +44,7 @@ subtool_list = {
     'backend': {
         'codegen': 'Code generation tool',
         'profile': 'Profile backend model file',
+        'infer': 'Infer backend model file'
     },
 }
 
@@ -64,12 +67,25 @@ def _check_subtool_exists():
 
 
 def _get_parser():
-    onecc_usage = 'onecc [-h] [-v] [-C CONFIG] [COMMAND <args>]'
+    onecc_usage = 'onecc [-h] [-v] [-C CONFIG] [-W WORKFLOW] [-O OPTIMIZATION] [COMMAND <args>]'
     onecc_desc = 'Run ONE driver via several commands or configuration file'
     parser = argparse.ArgumentParser(description=onecc_desc, usage=onecc_usage)
 
     _utils._add_default_arg(parser)
 
+    opt_name_list = _utils._get_optimization_list(get_name=True)
+    opt_name_list = ['-' + s for s in opt_name_list]
+    if not opt_name_list:
+        opt_help_message = '(No available optimization options)'
+    else:
+        opt_help_message = '(Available optimization options: ' + ', '.join(
+            opt_name_list) + ')'
+    opt_help_message = 'optimization name to use ' + opt_help_message
+    parser.add_argument('-O', type=str, metavar='OPTIMIZATION', help=opt_help_message)
+
+    parser.add_argument(
+        '-W', '--workflow', type=str, metavar='WORKFLOW', help='run with workflow file')
+
     # just for help message
     compile_group = parser.add_argument_group('compile to circle model')
     for tool, desc in subtool_list['compile'].items():
@@ -98,45 +114,17 @@ def _parse_arg(parser):
 def _verify_arg(parser, args):
     """verify given arguments"""
     # check if required arguments is given
-    if not _utils._is_valid_attr(args, 'config'):
-        parser.error('-C/--config argument is required')
-
-
-def _get_driver_name(driver_name):
-    return {
-        'one-optimize': 'one-optimize',
-        'one-quantize': 'one-quantize',
-        'one-pack': 'one-pack',
-        'one-codegen': 'one-codegen',
-        'one-profile': 'one-profile'
-    }[driver_name]
-
-
-def _parse_cfg(args):
-    config = configparser.ConfigParser()
-    config.optionxform = str
-    parsed = config.read(os.path.expanduser(getattr(args, 'config')))
-    if not parsed:
-        raise FileNotFoundError('Not found given configuration file')
-    return config
-
-
-def _is_available_driver(config, driver_name):
-    return config.has_option('onecc', driver_name) and config.getboolean(
-        'onecc', driver_name)
-
-
-def _verify_cfg(import_driver_list, config):
-    if not config.has_section('onecc'):
-        raise ImportError('[onecc] section is required in configuration file')
-
-    import_driver_cnt = 0
-    for d in import_driver_list:
-        if _is_available_driver(config, d):
-            import_driver_cnt += 1
-
-    if import_driver_cnt > 1:
-        raise AssertionError('Only one import-* driver can be executed')
+    if not _utils._is_valid_attr(args, 'config') and not _utils._is_valid_attr(
+            args, 'workflow'):
+        parser.error('-C/--config or -W/--workflow argument is required')
+    # check if given optimization option exists
+    opt_name_list = _utils._get_optimization_list(get_name=True)
+    opt_name_list = [_utils._remove_prefix(s, 'O') for s in opt_name_list]
+    if _utils._is_valid_attr(args, 'O'):
+        if ' ' in getattr(args, 'O'):
+            parser.error('Not allowed to have space in the optimization name')
+        if not getattr(args, 'O') in opt_name_list:
+            parser.error('Invalid optimization option')
 
 
 def main():
@@ -158,35 +146,16 @@ def main():
     # verify arguments
     _verify_arg(parser, args)
 
-    # parse configuration file
-    config = _parse_cfg(args)
-
-    # verify configuration file
     bin_dir = os.path.dirname(os.path.realpath(__file__))
-    import_drivers_dict = _utils._detect_one_import_drivers(bin_dir)
-    transform_drivers = [
-        'one-optimize', 'one-quantize', 'one-pack', 'one-codegen', 'one-profile'
-    ]
-    _verify_cfg(import_drivers_dict, config)
-
-    # get sections to run
-    section_to_run = []
-    for d in list(import_drivers_dict) + transform_drivers:
-        if _is_available_driver(config, d):
-            section_to_run.append(d)
-
-    # run
-    dir_path = os.path.dirname(os.path.realpath(__file__))
-    for section in section_to_run:
-        if section in import_drivers_dict:
-            # we already has driver name in dict
-            driver_name = import_drivers_dict[section]
-        else:
-            driver_name = _get_driver_name(section)
-        options = ['--config', getattr(args, 'config'), '--section', section]
-        if _utils._is_valid_attr(args, 'verbose'):
-            options.append('--verbose')
-        _call_driver(driver_name, options)
+    if _utils._is_valid_attr(args, 'config'):
+        runner = CfgRunner(args.config)
+        runner.detect_import_drivers(bin_dir)
+        if _utils._is_valid_attr(args, 'O'):
+            runner.add_opt(getattr(args, 'O'))
+        runner.run(bin_dir)
+    elif _utils._is_valid_attr(args, 'workflow'):
+        runner = WorkflowRunner(args.workflow)
+        runner.run(bin_dir)
 
 
 if __name__ == '__main__':
diff --git a/compiler/one-cmds/onecc.template.cfg b/compiler/one-cmds/onecc.template.cfg
index a23d1cea9..6f6a4e266 100644
--- a/compiler/one-cmds/onecc.template.cfg
+++ b/compiler/one-cmds/onecc.template.cfg
@@ -1,28 +1,144 @@
+; To activate a step (or task),
+; set True for the step in [onecc] section and fill options in the corresponding section
 [onecc]
-one-import-tf=True
+; neural network model to circle
+one-import-tf=False
 one-import-tflite=False
 one-import-bcq=False
 one-import-onnx=False
-one-optimize=True
+; circle to circle with optimization
+one-optimize=False
+; circle to circle with quantization
 one-quantize=False
-one-pack=True
+; partition circle
+one-partition=False
+; package circle and metadata into nnpackage
+one-pack=False
+; generate code for backend
 one-codegen=False
+; profile
 one-profile=False
+; infer
+one-infer=False
 
 [one-import-tf]
-input_path=/path/to/inception_v3.pb
-output_path=inception_v3.circle
-input_arrays=input
-input_shapes=1,299,299,3
-output_arrays=InceptionV3/Predictions/Reshape_1
-converter_version=v1
+# mandatory
+; pb file
+input_path=
+; circle file
+output_path=
+# optional
+; v1 or v2
+converter_version=v2
+; graph_def(default), saved_model or keras_model
 model_format=graph_def
+# optional but mandatory for model_format=graph_def
+; input tensor names of the input arrays, comma-separated
+input_arrays=
+; output tensor names of the output arrays, comma-separated
+output_arrays=
+; input shapes corresponding to --input_arrays, colon-separated.(ex:1,4,4,3:1,20,20,3)
+input_shapes=
+
+[one-import-tflite]
+# mandatory
+; tflite file
+input_path=
+; circle file
+output_path=
+
+[one-import-bcq]
+# mandatory
+; bcq file
+input_path=
+; circle file
+output_path=
+# optional
+; v1 or v2
+converter_version=v2
+; graph_def(default), saved_model or keras_model
+model_format=graph_def
+# optional but mandatory for model_format=graph_def
+; input tensor names of the input arrays, comma-separated
+input_arrays=
+; output tensor names of the output arrays, comma-separated
+output_arrays=
+; input shapes corresponding to --input_arrays, colon-separated.(ex:1,4,4,3:1,20,20,3)
+input_shapes=
+
+[one-import-onnx]
+# mandatory
+; onnx file
+input_path=
+; circle file
+output_path=
+# optional
+; True or False
+unroll_rnn=
+; True or False
+unroll_lstm=
 
 [one-optimize]
-input_path=inception_v3.circle
-output_path=inception_v3.opt.circle
-generate_profile_data=False
+# mandatory
+; circle file
+input_path=
+; circle file
+output_path=
+# //TODO: Add available options
+
+[one-quantize]
+# mandatory
+; circle file
+input_path=
+; circle file
+output_path=
+# optional arguments for quantization
+; input data file (if not given, random data will be used for calibration)
+input_data=
+; h5/hdf5(default), list/filelist, or dir/directory
+input_data_format=
+; dtype of quantized model (uint8(default), int16)
+quantized_dtype=
+; granularity of quantization (layer(default), channel)
+granularity=
+; dtype of model's input (uint8, int16, float32). Same as quantized_dtype by default.
+input_type=
+; dtype of model's output (uint8, int16, float32). Same as quantized_dtype by default.
+output_type=
+
+[one-partition]
+# mandatory
+; partition file which provides backend to assign
+part_file=
+; circle file
+input_file=
+# //TODO: Add available options
 
 [one-pack]
-input_path=inception_v3.opt.circle
-output_path=inception_v3_pack
+# mandatory
+; input path
+input_path=
+; output path
+output_path=
+# //TODO: Add available options
+
+[one-codegen]
+# mandatory
+; backend name
+backend=
+; commands for each backend
+command=
+
+[one-profile]
+# mandatory
+; backend name
+backend=
+# //TODO: Add available options
+
+[one-infer]
+# mandatory (mutually exclusive)
+; backend name
+backend=
+; driver name
+driver=
+# //TODO: Add available options
diff --git a/compiler/one-cmds/onelib/CfgRunner.py b/compiler/one-cmds/onelib/CfgRunner.py
new file mode 100644
index 000000000..c66e5b4ba
--- /dev/null
+++ b/compiler/one-cmds/onelib/CfgRunner.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import configparser
+import os
+import warnings
+
+import utils as oneutils
+
+
+def _simple_warning(message, category, filename, lineno, file=None, line=None):
+    """Format a warning as '<CategoryName>: <message>' plus a trailing newline.
+
+    Only `message` and `category` are used; the location arguments
+    (filename, lineno, file, line) are accepted and ignored.
+    """
+    return '{}: {}\n'.format(category.__name__, message)
+
+
+class CfgRunner:
+    """Runs the drivers that are switched on in an onecc configuration file."""
+
+    # non-import drivers, in the order they are checked and launched by run()
+    driver_sequence = [
+        'one-optimize', 'one-quantize', 'one-pack', 'one-codegen', 'one-profile',
+        'one-partition', 'one-infer'
+    ]
+
+    def __init__(self, path):
+        """Read and verify the configuration file at `path`.
+
+        Raises FileNotFoundError when nothing could be read from `path`, and
+        ImportError when the file has neither an [onecc] nor a [one-build]
+        section.
+        """
+        self.path = path
+        self.optparser = None
+        cfg = configparser.ConfigParser()
+        # keep option names case sensitive
+        cfg.optionxform = str
+        self.cfgparser = cfg
+        if not cfg.read(os.path.expanduser(path)):
+            raise FileNotFoundError('Not found given configuration file')
+
+        self._verify_cfg(cfg)
+        # import drivers used unless detect_import_drivers() replaces them
+        self.import_drivers = [
+            'one-import-bcq', 'one-import-onnx', 'one-import-tf', 'one-import-tflite'
+        ]
+
+    def _verify_cfg(self, cfgparser):
+        """Require an [onecc] section; accept [one-build] with a warning."""
+        if cfgparser.has_section('onecc'):
+            return
+        if not cfgparser.has_section('one-build'):
+            raise ImportError('[onecc] section is required in configuration file')
+        # replaces the process-wide warning formatter before warning
+        warnings.formatwarning = _simple_warning
+        warnings.warn(
+            "[one-build] section will be deprecated. Please use [onecc] section.")
+
+    def _is_available(self, driver):
+        """Return True when `driver` is set to a true value in [onecc] or [one-build].
+
+        [one-build] is consulted as well for backward compatibility.
+        """
+        for section in ('onecc', 'one-build'):
+            if not self.cfgparser.has_option(section, driver):
+                continue
+            if self.cfgparser.getboolean(section, driver):
+                return True
+        return False
+
+    def add_opt(self, opt):
+        """Load the optimization configuration named 'O' + `opt` and remember `opt`.
+
+        Raises FileNotFoundError when that file cannot be read, and
+        AssertionError unless its only section is [one-optimize].
+        """
+        parser = configparser.ConfigParser()
+        # keep option names case sensitive
+        parser.optionxform = str
+        self.optparser = parser
+
+        names = oneutils._get_optimization_list(get_name=True)
+        entries = oneutils._get_optimization_list()
+        opt_book = dict(zip(names, entries))
+        if not parser.read(opt_book['O' + opt]):
+            raise FileNotFoundError('Not found given optimization configuration file')
+        if parser.sections() != ['one-optimize']:
+            raise AssertionError(
+                'Optimization configuration file only allowed to have a \'one-optimize\' section'
+            )
+        self.opt = opt
+
+    def detect_import_drivers(self, dir):
+        """Replace the import driver list with the keys detected under `dir`."""
+        detected = oneutils._detect_one_import_drivers(dir)
+        self.import_drivers = list(detected.keys())
+
+    def run(self, working_dir, verbose=False):
+        """Run every available driver, import drivers first, then driver_sequence.
+
+        Each driver is invoked as `<working_dir>/<section> --config <path>
+        --section <section>`, with `-O <opt>` added for one-optimize after
+        add_opt() and `--verbose` added when `verbose` is true.
+        """
+        candidates = self.import_drivers + self.driver_sequence
+        section_to_run = [d for d in candidates if self._is_available(d)]
+
+        for section in section_to_run:
+            cmd = [os.path.join(working_dir, section)]
+            cmd += ['--config', self.path, '--section', section]
+            if section == 'one-optimize' and self.optparser:
+                cmd += ['-O', self.opt]
+            if verbose:
+                cmd.append('--verbose')
+            oneutils._run(cmd)
diff --git a/compiler/one-cmds/onelib/OptionBuilder.py b/compiler/one-cmds/onelib/OptionBuilder.py
new file mode 100644
index 000000000..6a75783ad
--- /dev/null
+++ b/compiler/one-cmds/onelib/OptionBuilder.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from onelib.constant import CONSTANT
+
+
+class OptionBuilder:
+    """Build the command-line options of a one-cmd driver from a commands mapping.
+
+    `commands` maps option names (without the leading '--') to their values.
+    """
+
+    def __init__(self, one_cmd_type):
+        # driver name such as 'one-import-tf'; build() uses it to pick a builder
+        self.type = one_cmd_type
+
+    def _build_default(self, commands):
+        """Emit every entry as the pair '--<key>', <value>."""
+        options = []
+        for k, v in commands.items():
+            options.extend(['--' + k, v])
+        return options
+
+    def _build_with_unknown_command(self, commands):
+        """Emit '--<key>', <value> pairs, then '--' and the split 'command' value.
+
+        Raises KeyError when `commands` has no 'command' entry.
+        """
+        COMMAND_K = 'command'
+        options = []
+        for k, v in commands.items():
+            if k == COMMAND_K:
+                continue
+            options.extend(['--' + k, v])
+        # everything after '--' is the whitespace-split 'command' string
+        options.extend(['--'])
+        options.extend(commands[COMMAND_K].split())
+        return options
+
+    def _build_with_flags(self, commands, flags):
+        """Emit options where the keys listed in `flags` are no-argument switches.
+
+        A switch is emitted as a bare '--<key>' only when its value is "True".
+        Any other value means the switch is off and nothing is emitted for it,
+        which matches how _build_optimize treats its switches; emitting
+        '--<key> False' would hand a stray positional value to the driver.
+        Every key not in `flags` is emitted as '--<key>', <value>.
+        """
+        options = []
+        for k, v in commands.items():
+            if k in flags:
+                if v == "True":
+                    options.append('--' + k)
+                continue
+            options.extend(['--' + k, v])
+        return options
+
+    def _build_import(self, commands):
+        """Options for the import drivers; 'save_intermediate' is a switch."""
+        # NOTE(review): other boolean import options may also be switches;
+        # confirm against the drivers' argument parsers before adding them.
+        return self._build_with_flags(commands, ['save_intermediate'])
+
+    def _build_optimize(self, commands):
+        """Options for one-optimize.
+
+        Keys in `arg_1` are emitted with their value. 'generate_profile_data'
+        and the names in CONSTANT.OPTIMIZATION_OPTS are emitted as bare
+        switches when their value is "True". Any other key is dropped.
+        """
+        options = []
+        arg_0 = ['generate_profile_data']
+        arg_1 = ['input_path', 'output_path', 'change_outputs']
+        for k, v in commands.items():
+            if k in arg_1:
+                options.extend(['--' + k, v])
+                continue
+            if k in arg_0 and v == 'True':
+                options.extend(['--' + k])
+                continue
+            for opt in CONSTANT.OPTIMIZATION_OPTS:
+                if k == opt[0] and v == "True":
+                    options.extend(['--' + k])
+                    break
+        return options
+
+    def _build_quantize(self, commands):
+        """Options for one-quantize; the keys in `arg_0` are switches."""
+        # 'print_mse' is listed with the other print_* evaluation switches so
+        # that it is not emitted as '--print_mse True'.
+        arg_0 = [
+            'generate_profile_data', 'save_intermediate', 'TF-style_maxpool',
+            'evaluate_result', 'print_mae', 'print_mape', 'print_mpeir',
+            'print_top1_match', 'print_top5_match', 'print_mse', 'force_quantparam',
+            'copy_quantparam'
+        ]
+        return self._build_with_flags(commands, arg_0)
+
+    def build(self, commands):
+        """Return the option list for `commands` using the builder of self.type.
+
+        Raises KeyError when self.type is not a known driver name.
+        """
+        cmd_book = dict.fromkeys(
+            ['one-import-bcq', 'one-import-tflite', 'one-pack', 'one-partition'],
+            self._build_default)
+        cmd_book['one-codegen'] = self._build_with_unknown_command
+        cmd_book['one-import-onnx'] = self._build_import
+        cmd_book['one-import-pytorch'] = self._build_import
+        cmd_book['one-import-tf'] = self._build_import
+        cmd_book['one-infer'] = self._build_with_unknown_command
+        cmd_book['one-optimize'] = self._build_optimize
+        cmd_book['one-profile'] = self._build_with_unknown_command
+        cmd_book['one-quantize'] = self._build_quantize
+
+        return cmd_book[self.type](commands)
diff --git a/compiler/one-cmds/onelib/TopologicalSortHelper.py b/compiler/one-cmds/onelib/TopologicalSortHelper.py
new file mode 100644
index 000000000..d05adea8d
--- /dev/null
+++ b/compiler/one-cmds/onelib/TopologicalSortHelper.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+
+
+class TopologicalSortHelper:
+    """Depth-first topological ordering of a fixed set of vertices."""
+
+    def __init__(self, vertices):
+        # adjacency lists: graph[u] holds the vertices that must come after u
+        self.graph = defaultdict(list)
+        self.vertices = vertices
+
+    def add_edge(self, u, v):
+        """Record that `u` must be ordered before `v`."""
+        self.graph[u].append(v)
+
+    def sort_util(self, v, visited, stack):
+        """Visit `v` and its unvisited successors, then put `v` at the front of `stack`.
+
+        `visited` maps each vertex to a bool and is updated in place. A
+        successor that is not a key of `visited` raises KeyError.
+        """
+        visited[v] = True
+
+        # .get() avoids creating an empty defaultdict entry for every vertex
+        # that has no outgoing edge
+        for i in self.graph.get(v, ()):
+            if not visited[i]:
+                self.sort_util(i, visited, stack)
+
+        # v goes in front of everything reachable from it
+        stack.insert(0, v)
+
+    def sort(self):
+        """Return the vertices as a list in which every edge u -> v has u before v.
+
+        The graph is not checked for cycles.
+        """
+        visited = dict.fromkeys(self.vertices, False)
+        stack = []
+
+        for v in self.vertices:
+            if not visited[v]:
+                self.sort_util(v, visited, stack)
+
+        return stack
diff --git a/compiler/one-cmds/onelib/WorkflowRunner.py b/compiler/one-cmds/onelib/WorkflowRunner.py
new file mode 100644
index 000000000..0482dd9da
--- /dev/null
+++ b/compiler/one-cmds/onelib/WorkflowRunner.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+
+from onelib.OptionBuilder import OptionBuilder
+from onelib.TopologicalSortHelper import TopologicalSortHelper
+from onelib.CfgRunner import CfgRunner
+import utils as oneutils
+
+
+class WorkflowRunner:
+    """Runs the workflows described in a JSON workflow file in dependency order."""
+
+    # keys recognised in the workflow file
+    WORKFLOWS_K = 'workflows'
+    DEPENDENCIES_K = 'run-after'
+    CFG_REFERENCE_K = 'cfg-reference'
+    WORKFLOW_STEPS_K = 'steps'
+    ONE_CMD_TOOL_K = 'one-cmd'
+    COMMANDS_K = 'commands'
+
+    def __init__(self, path):
+        """Load, verify and order the workflows of the file at `path`.
+
+        Raises:
+            FileNotFoundError: `path` does not exist.
+            ImportError: the file is not valid JSON.
+            ValueError: the contents are rejected by _verify_workflow().
+            KeyError: a "run-after" entry names a workflow that is not listed.
+            RuntimeError: the "run-after" dependencies form a cycle.
+        """
+        try:
+            with open(path) as f:
+                self.json_contents = json.load(f)
+        except FileNotFoundError:
+            raise FileNotFoundError("Not found given workflow file")
+        except json.decoder.JSONDecodeError:
+            raise ImportError("Invalid workflow file")
+
+        self._verify_workflow(self.json_contents)
+
+        workflows = self.json_contents[self.WORKFLOWS_K]
+        # Every workflow needs its OWN successor list. dict.fromkeys(workflows, [])
+        # binds all keys to one shared list, so an edge recorded for one workflow
+        # shows up under every workflow and _check_cycle() can report a cycle
+        # for independent dependency chains.
+        self.adj = {workflow_k: [] for workflow_k in workflows}
+        # decide the order according to the dependencies of each workflow.
+        helper = TopologicalSortHelper(workflows)
+        for workflow_k in workflows:
+            workflow = self.json_contents[workflow_k]
+            if self.DEPENDENCIES_K in workflow:
+                for previous_workflow in workflow[self.DEPENDENCIES_K]:
+                    helper.add_edge(previous_workflow, workflow_k)
+                    self.adj[previous_workflow].append(workflow_k)
+        self.workflow_sequence = helper.sort()
+
+        self._check_cycle()
+
+    def _check_cycle(self):
+        """Raise RuntimeError when a workflow is ordered after one of its successors."""
+        # position of each workflow in the computed sequence
+        pos = {wf: idx for idx, wf in enumerate(self.workflow_sequence)}
+
+        for first_wf in self.workflow_sequence:
+            for adj_wf in self.adj[first_wf]:
+                # an edge first_wf -> adj_wf pointing backwards in the sequence
+                # means the dependencies cannot all be satisfied
+                if pos.get(first_wf, 0) > pos.get(adj_wf, 0):
+                    raise RuntimeError("Workflows should not have a cycle")
+
+    def _verify_workflow(self, json_contents):
+        """Check the structure of the parsed workflow file.
+
+        Raises ValueError when the "workflows" key is missing, a listed
+        workflow has no entry, a workflow has neither or both of "steps" and
+        "cfg-reference", or a step lacks "one-cmd" or "commands".
+        """
+        # workflow file should have WORKFLOWS_K
+        if not self.WORKFLOWS_K in json_contents:
+            raise ValueError("Not found \"" + self.WORKFLOWS_K +
+                             "\" key in workflow file")
+
+        workflows = json_contents[self.WORKFLOWS_K]
+        # workflow file should have keys listed in WORKFLOWS_K
+        for workflow_k in workflows:
+            if not workflow_k in json_contents:
+                raise ValueError("Not found " + workflow_k + " key listed in \"" +
+                                 self.WORKFLOWS_K + "\"")
+
+        # each workflow should have either WORKFLOW_STEPS_K or CFG_REFERENCE_K
+        for workflow_k in workflows:
+            workflow = json_contents[workflow_k]
+            has_steps = self.WORKFLOW_STEPS_K in workflow
+            has_cfg = self.CFG_REFERENCE_K in workflow
+            if not has_steps and not has_cfg:
+                raise ValueError("Each workflow should have either \"" +
+                                 self.WORKFLOW_STEPS_K + "\" or \"" +
+                                 self.CFG_REFERENCE_K + "\"")
+        for workflow_k in workflows:
+            workflow = json_contents[workflow_k]
+            if self.WORKFLOW_STEPS_K in workflow and self.CFG_REFERENCE_K in workflow:
+                raise ValueError("\"" + self.WORKFLOW_STEPS_K + "\" and \"" +
+                                 self.CFG_REFERENCE_K + "\" are exclusive key")
+
+        # each step should have ONE_CMD_TOOL_K and COMMANDS_K
+        for workflow_k in workflows:
+            workflow = json_contents[workflow_k]
+            if self.WORKFLOW_STEPS_K in workflow:
+                step_keys = workflow[self.WORKFLOW_STEPS_K]
+                for step_k in step_keys:
+                    step = workflow[step_k]
+                    if not self.ONE_CMD_TOOL_K in step or not self.COMMANDS_K in step:
+                        raise ValueError("Each step should have \"" +
+                                         self.ONE_CMD_TOOL_K + "\"" + " and \"" +
+                                         self.COMMANDS_K + "\"")
+
+    def run(self, working_dir, verbose=False):
+        """Run every workflow in the computed sequence.
+
+        A workflow with "steps" runs each step's driver from `working_dir` with
+        options built by OptionBuilder. A workflow with "cfg-reference" is
+        delegated to CfgRunner with the referenced 'path'; `verbose` is passed
+        on only in that case.
+        """
+        # run workflows in sequence
+        for workflow_k in self.workflow_sequence:
+            workflow = self.json_contents[workflow_k]
+            if self.WORKFLOW_STEPS_K in workflow:
+                steps = workflow[self.WORKFLOW_STEPS_K]
+                for step_k in steps:
+                    step = workflow[step_k]
+                    commands = step[self.COMMANDS_K]
+                    driver_name = step[self.ONE_CMD_TOOL_K]
+                    option_builder = OptionBuilder(driver_name)
+                    options = option_builder.build(commands)
+                    # the driver is expected next to the caller, in working_dir
+                    driver_path = os.path.join(working_dir, driver_name)
+                    cmd = [driver_path] + options
+                    oneutils._run(cmd)
+            elif self.CFG_REFERENCE_K in workflow:
+                cfg_path = workflow[self.CFG_REFERENCE_K]['path']
+                runner = CfgRunner(cfg_path)
+                runner.run(working_dir, verbose)
diff --git a/compiler/one-cmds/onelib/constant.py b/compiler/one-cmds/onelib/constant.py
index 7ddd7382d..7dd79b65d 100644
--- a/compiler/one-cmds/onelib/constant.py
+++ b/compiler/one-cmds/onelib/constant.py
@@ -14,11 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
 class CONSTANT:
     __slots__ = ()  # This prevents access via __dict__.
     OPTIMIZATION_OPTS = (
         # (OPTION_NAME, HELP_MESSAGE)
-        ('O1', 'enable O1 optimization pass'),
         ('convert_nchw_to_nhwc',
          'Experimental: This will convert NCHW operators to NHWC under the assumption that input model is NCHW.'
          ),
@@ -29,6 +29,7 @@ class CONSTANT:
          'convert the output shape of the model (argument for convert_nchw_to_nhwc)'),
         ('fold_add_v2', 'fold AddV2 op with constant inputs'),
         ('fold_cast', 'fold Cast op with constant input'),
+        ('fold_densify', 'fold Densify op with sparse constant input'),
         ('fold_dequantize', 'fold Dequantize op'),
         ('fold_dwconv', 'fold Depthwise Convolution op with constant inputs'),
         ('fold_gather', 'fold Gather op'),
@@ -62,12 +63,16 @@ class CONSTANT:
         ('remove_unnecessary_slice', 'remove unnecessary slice ops'),
         ('remove_unnecessary_strided_slice', 'remove unnecessary strided slice ops'),
         ('remove_unnecessary_split', 'remove unnecessary split ops'),
+        ('replace_non_const_fc_with_batch_matmul',
+         'replace FullyConnected op with non-const weights to BatchMatMul op'),
+        ('replace_sub_with_add', 'replace Sub op with Add op'),
         ('resolve_customop_add', 'convert Custom(Add) op to Add op'),
         ('resolve_customop_batchmatmul',
          'convert Custom(BatchMatmul) op to BatchMatmul op'),
         ('resolve_customop_matmul', 'convert Custom(Matmul) op to Matmul op'),
         ('resolve_customop_max_pool_with_argmax',
          'convert Custom(MaxPoolWithArgmax) to net of builtin operators'),
+        ('resolve_customop_splitv', 'convert Custom(SplitV) op to SplitV op'),
         ('shuffle_weight_to_16x1float32',
          'convert weight format of FullyConnected op to SHUFFLED16x1FLOAT32.'
          ' Note that it only converts weights whose row is a multiple of 16'),
diff --git a/compiler/one-cmds/onelib/make_cmd.py b/compiler/one-cmds/onelib/make_cmd.py
index d8380f28d..0015e8319 100644
--- a/compiler/one-cmds/onelib/make_cmd.py
+++ b/compiler/one-cmds/onelib/make_cmd.py
@@ -19,6 +19,7 @@ import sys
 
 import onelib.constant as _constant
 
+
 def _is_valid_attr(args, attr):
     return hasattr(args, attr) and getattr(args, attr)
 
@@ -64,6 +65,10 @@ def make_tf2tfliteV2_cmd(args, driver_path, input_path, output_path):
         cmd.append('--output_arrays')
         cmd.append(getattr(args, 'output_arrays'))
 
+    # experimental options
+    if _is_valid_attr(args, 'experimental_disable_batchmatmul_unfold'):
+        cmd.append('--experimental_disable_batchmatmul_unfold')
+
     return cmd
 
 
diff --git a/compiler/one-cmds/onnx_legalizer.py b/compiler/one-cmds/onnx_legalizer.py
index 26c2b75b9..0141514b6 100755
--- a/compiler/one-cmds/onnx_legalizer.py
+++ b/compiler/one-cmds/onnx_legalizer.py
@@ -341,7 +341,8 @@ def _dtype_to_np(dtype):
         raise NotImplementedError('unsupported data type')
 
 
-def _generate_one_direction_RNN(transformer, X, W, R, B, initial_h, clip, activation_name):
+def _generate_one_direction_RNN(transformer, X, W, R, B, initial_h, clip,
+                                activation_name):
     """Generate subgraph of one direction of unrolled RNN layer
 
     Args:
@@ -395,7 +396,7 @@ def _generate_one_direction_RNN(transformer, X, W, R, B, initial_h, clip, activa
 
 
 def _transform_unidirectional_RNN(transformer, original_node, x, tensor_infos, activation,
-                                 clip, direction, hidden_size, layout):
+                                  clip, direction, hidden_size, layout):
     """Generate Simple (forward or reverse) unrolled RNN
 
     Args:
@@ -432,7 +433,7 @@ def _transform_unidirectional_RNN(transformer, original_node, x, tensor_infos, a
     else:
         initial_h = None
     state_tensors = _generate_one_direction_RNN(transformer, x, w, r, b, initial_h, clip,
-                                               activation)
+                                                activation)
     y_direction_dim = layout + 1
     y_h_direction_dim = layout
     state_layout_tensors = []
@@ -447,12 +448,11 @@ def _transform_unidirectional_RNN(transformer, original_node, x, tensor_infos, a
     transformer.make_node(
         'Unsqueeze', [state_tensors[-1]], [Y_h], axes=[y_h_direction_dim])
     Y = outputs[0]
-    transformer.make_node(
-        'Concat', state_layout_tensors, [Y], axis=seq_length_dim)
+    transformer.make_node('Concat', state_layout_tensors, [Y], axis=seq_length_dim)
 
 
 def _transform_bidirectional_RNN(transformer, original_node, x, tensor_infos, activations,
-                                clip, hidden_size, layout):
+                                 clip, hidden_size, layout):
     """Generate Bidirectional unrolled RNN
 
     Args:
@@ -503,10 +503,10 @@ def _transform_bidirectional_RNN(transformer, original_node, x, tensor_infos, ac
             initial_h[d] = transformer.make_squeeze(initial_h[d], axes=[direction_dim])
 
     state_f_tensors = _generate_one_direction_RNN(transformer, x, w[0], r[0], b[0],
-                                                 initial_h[0], clip, activations[0])
+                                                  initial_h[0], clip, activations[0])
     x.reverse()
     state_b_tensors = _generate_one_direction_RNN(transformer, x, w[1], r[1], b[1],
-                                                 initial_h[1], clip, activations[1])
+                                                  initial_h[1], clip, activations[1])
     state_b_tensors.reverse()
 
     y_direction_dim = layout + 1
@@ -538,8 +538,7 @@ def _transform_bidirectional_RNN(transformer, original_node, x, tensor_infos, ac
         axis=y_h_direction_dim)
 
     Y = outputs[0]
-    transformer.make_node(
-        'Concat', state_layout_tensors, [Y], axis=seq_length_dim)
+    transformer.make_node('Concat', state_layout_tensors, [Y], axis=seq_length_dim)
 
 
 def _legalize_RNN(transformer, tensor_infos, node):
@@ -600,10 +599,10 @@ def _legalize_RNN(transformer, tensor_infos, node):
 
     if direction in ['forward', 'reverse']:
         _transform_unidirectional_RNN(transformer, node, x, tensor_infos, activations[0],
-                                     clip, direction, hidden_size, layout)
+                                      clip, direction, hidden_size, layout)
     elif direction == 'bidirectional':
-        _transform_bidirectional_RNN(transformer, node, x, tensor_infos, activations, clip,
-                                    hidden_size, layout)
+        _transform_bidirectional_RNN(transformer, node, x, tensor_infos, activations,
+                                     clip, hidden_size, layout)
     else:
         raise RuntimeError('Unknown RNN type')
 
@@ -611,7 +610,7 @@ def _legalize_RNN(transformer, tensor_infos, node):
 
 
 def _generate_one_direction_LSTM(transformer, X, W, R, B, initial_h, initial_c, P, clip,
-                                act, dtype, hidden_size, batch_size):
+                                 act, dtype, hidden_size, batch_size):
     """Generate subgraph for one direction of unrolled LSTM layer
 
     Args:
@@ -754,7 +753,7 @@ def _generate_one_direction_LSTM(transformer, X, W, R, B, initial_h, initial_c,
 
 
 def _transform_unidirectional_LSTM(transformer, original_node, x, tensor_infos,
-                                  activations, clip, direction, hidden_size, layout):
+                                   activations, clip, direction, hidden_size, layout):
     """Generate Simple (forward or reverse) unrolled LSTM
 
     Args:
@@ -818,17 +817,15 @@ def _transform_unidirectional_LSTM(transformer, original_node, x, tensor_infos,
     transformer.make_node(
         'Unsqueeze', [state_h_tensors[-1]], [Y_h], axes=[y_h_direction_dim])
     Y_c = outputs[2]
-    transformer.make_node(
-        'Unsqueeze', [state_c_tensor], [Y_c], axes=[y_h_direction_dim])
+    transformer.make_node('Unsqueeze', [state_c_tensor], [Y_c], axes=[y_h_direction_dim])
     if direction == 'reverse':
         state_layout_tensors.reverse()
     Y = outputs[0]
-    transformer.make_node(
-        'Concat', state_layout_tensors, [Y], axis=seq_length_dim)
+    transformer.make_node('Concat', state_layout_tensors, [Y], axis=seq_length_dim)
 
 
-def _transform_bidirectional_LSTM(transformer, original_node, x, tensor_infos, activations,
-                                 clip, hidden_size, layout):
+def _transform_bidirectional_LSTM(transformer, original_node, x, tensor_infos,
+                                  activations, clip, hidden_size, layout):
     """Generate Bidirectional unrolled LSTM
 
     Args:
@@ -929,12 +926,10 @@ def _transform_bidirectional_LSTM(transformer, original_node, x, tensor_infos, a
     Y_f_c = transformer.make_unsqueeze(state_f_c_tensor, axes=[y_c_direction_dim])
     Y_b_c = transformer.make_unsqueeze(state_b_c_tensor, axes=[y_c_direction_dim])
     Y_c = outputs[2]
-    transformer.make_node(
-        'Concat', [Y_f_c, Y_b_c], [Y_c], axis=y_c_direction_dim)
+    transformer.make_node('Concat', [Y_f_c, Y_b_c], [Y_c], axis=y_c_direction_dim)
 
     Y = outputs[0]
-    transformer.make_node(
-        'Concat', state_layout_tensors, [Y], axis=seq_length_dim)
+    transformer.make_node('Concat', state_layout_tensors, [Y], axis=seq_length_dim)
 
 
 def _legalize_LSTM(transformer, tensor_infos, node):
@@ -1001,10 +996,10 @@ def _legalize_LSTM(transformer, tensor_infos, node):
 
     if direction in ['forward', 'reverse']:
         _transform_unidirectional_LSTM(transformer, node, x, tensor_infos, activations,
-                                      clip, direction, hidden_size, layout)
+                                       clip, direction, hidden_size, layout)
     elif direction == 'bidirectional':
         _transform_bidirectional_LSTM(transformer, node, x, tensor_infos, activations,
-                                     clip, hidden_size, layout)
+                                      clip, hidden_size, layout)
     else:
         raise RuntimeError('Unknown LSTM type')
 
@@ -1052,10 +1047,12 @@ def legalize(model, options):
 
 if __name__ == '__main__':
     if len(sys.argv) < 3:
-        print('usage: ./legalize_onnx.py <path to input model> <path to output model>\n'
-              '\n'
-              '    In stand-alone utility mode this tool provides basic funtionality\n'
-              '    If you want to have more control over applied transformations, use this legalizer as a library')
+        print(
+            'usage: ./legalize_onnx.py <path to input model> <path to output model>\n'
+            '\n'
+            '    In stand-alone utility mode this tool provides basic funtionality\n'
+            '    If you want to have more control over applied transformations, use this legalizer as a library'
+        )
         exit(1)
     options = LegalizeOptions()
     options.unroll_lstm = True
diff --git a/compiler/one-cmds/requires.cmake b/compiler/one-cmds/requires.cmake
index b1aabdb97..c27920980 100644
--- a/compiler/one-cmds/requires.cmake
+++ b/compiler/one-cmds/requires.cmake
@@ -1,6 +1,7 @@
 require("tf2tfliteV2")
 require("tflite2circle")
 require("circle2circle")
+require("circle-eval-diff")
 require("circle-quantizer")
 require("record-minmax")
 require("vconone")
diff --git a/compiler/one-cmds/tests/CMakeLists.txt b/compiler/one-cmds/tests/CMakeLists.txt
index caea756c2..17f55ec96 100644
--- a/compiler/one-cmds/tests/CMakeLists.txt
+++ b/compiler/one-cmds/tests/CMakeLists.txt
@@ -4,6 +4,8 @@
 file(GLOB TESTITEMS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "./*.test")
 file(GLOB CONFIGITEMS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "./*.cfg")
 file(GLOB QCONFIGITEMS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "./*.qconf.json")
+file(GLOB PYSCRIPTS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "./*.py")
+file(GLOB WORKFLOWITEMS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "./*.workflow.json")
 
 # Create a script to run the tests at installation folder
 set(DRIVER_SCRIPT "${CMAKE_CURRENT_BINARY_DIR}/runtestall.sh")
@@ -45,6 +47,16 @@ foreach(QCONFIGITEM IN ITEMS ${QCONFIGITEMS})
   install(FILES ${QCONFIGITEM} DESTINATION test)
 endforeach(QCONFIGITEM)
 
+foreach(PYSCRIPT IN ITEMS ${PYSCRIPTS})
+  get_filename_component(ITEM_PREFIX ${PYSCRIPT} NAME_WE)
+  install(FILES ${PYSCRIPT} DESTINATION test)
+endforeach(PYSCRIPT)
+
+foreach(WORKFLOWITEM IN ITEMS ${WORKFLOWITEMS})
+  get_filename_component(ITEM_PREFIX ${WORKFLOWITEM} NAME_WE)
+  install(FILES ${WORKFLOWITEM} DESTINATION test)
+endforeach(WORKFLOWITEM)
+
 file(APPEND "${DRIVER_SCRIPT}" "popd > /dev/null\n\n")
 
 file(APPEND "${DRIVER_SCRIPT}"
diff --git a/compiler/one-cmds/tests/OONECC_024.cfg b/compiler/one-cmds/tests/OONECC_024.cfg
new file mode 100644
index 000000000..a39aae071
--- /dev/null
+++ b/compiler/one-cmds/tests/OONECC_024.cfg
@@ -0,0 +1,2 @@
+[one-optimize]
+make_batchnorm_gamma_positive=True
diff --git a/compiler/one-cmds/tests/one-build_008.cfg b/compiler/one-cmds/tests/one-build_008.cfg
index 615047c86..8c777f64f 100644
--- a/compiler/one-cmds/tests/one-build_008.cfg
+++ b/compiler/one-cmds/tests/one-build_008.cfg
@@ -15,7 +15,6 @@ output_path=test_onnx_model.circle
 [one-optimize]
 input_path=test_onnx_model.circle
 output_path=test_onnx_model.opt.circle
-all=True
 remove_redundant_transpose=True
 
 [one-codegen]
diff --git a/compiler/one-cmds/tests/one-build_009.cfg b/compiler/one-cmds/tests/one-build_009.cfg
index 66bca250d..b5a35dd97 100644
--- a/compiler/one-cmds/tests/one-build_009.cfg
+++ b/compiler/one-cmds/tests/one-build_009.cfg
@@ -15,7 +15,6 @@ output_path=onnx_conv2d_conv2d.circle
 [one-optimize]
 input_path=onnx_conv2d_conv2d.circle
 output_path=onnx_conv2d_conv2d.opt.circle
-all=True
 remove_redundant_transpose=True
 convert_nchw_to_nhwc=True
 
diff --git a/compiler/one-cmds/tests/one-import-onnx_002.test b/compiler/one-cmds/tests/one-import-onnx_002.test
new file mode 100644
index 000000000..a6a38eee5
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import-onnx_002.test
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# test for experimental_disable_batchmatmul_unfold option
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./reshape_matmul.onnx"
+outputfile="./reshape_matmul.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test without the option; the result should contain FULLY_CONNECTED
+one-import-onnx \
+--input_path ${inputfile} \
+--output_path ${outputfile} > /dev/null 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+circle-operator --code reshape_matmul.circle > ${outputfile}.log 2>&1
+
+if ! grep -q "FULLY_CONNECTED" "${outputfile}.log"; then
+  trap_err_onexit
+fi
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test with the option; the result should contain BATCH_MATMUL
+one-import-onnx \
+--experimental_disable_batchmatmul_unfold \
+--input_path ${inputfile} \
+--output_path ${outputfile} > /dev/null 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+circle-operator --code reshape_matmul.circle > ${outputfile}.log 2>&1
+
+if ! grep -q "BATCH_MATMUL" "${outputfile}.log"; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
+exit 0
diff --git a/compiler/one-cmds/tests/one-infer-test-post-process.py b/compiler/one-cmds/tests/one-infer-test-post-process.py
new file mode 100644
index 000000000..0f0e0d701
--- /dev/null
+++ b/compiler/one-cmds/tests/one-infer-test-post-process.py
@@ -0,0 +1,16 @@
+# This script takes one argument and prints it
+
+import sys
+from pathlib import Path
+
+
+def main():
+    if len(sys.argv) < 2:
+        filepath = Path(sys.argv[0])
+        sys.exit("Usage: " + filepath.name + " [Word to print]")
+    word = sys.argv[1]
+    print(word)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/compiler/one-cmds/tests/one-infer_001.test b/compiler/one-cmds/tests/one-infer_001.test
new file mode 100644
index 000000000..e7b569522
--- /dev/null
+++ b/compiler/one-cmds/tests/one-infer_001.test
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  rm -rf ../bin/help-infer
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+# copy help-infer to bin folder
+cp help-infer ../bin/help-infer
+
+# run test
+one-infer -b help -- -h > ${filename}.log
+
+rm -rf ../bin/help-infer
+
+if grep -q "HELP MESSAGE!!" "${filename}.log"; then
+  echo "${filename_ext} SUCCESS"
+  exit 0
+fi
+
+trap_err_onexit
diff --git a/compiler/one-cmds/tests/one-infer_002.test b/compiler/one-cmds/tests/one-infer_002.test
new file mode 100644
index 000000000..22070de19
--- /dev/null
+++ b/compiler/one-cmds/tests/one-infer_002.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  rm -rf ../bin/dummy-infer
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="sample.tvn"
+
+if [[ ! -s "${inputfile}" ]]; then
+  touch ${inputfile}
+fi
+
+# copy dummy-infer to bin folder
+cp dummy-infer ../bin/dummy-infer
+
+# run test
+one-infer -d dummy-infer -- ${inputfile} > ${filename}.log
+
+rm -rf ../bin/dummy-infer
+
+if grep -q "dummy-infer dummy output!!!" "${filename}.log"; then
+  echo "${filename_ext} SUCCESS"
+  exit 0
+fi
+
+trap_err_onexit
diff --git a/compiler/one-cmds/tests/one-infer_003.test b/compiler/one-cmds/tests/one-infer_003.test
new file mode 100644
index 000000000..e2aa459a1
--- /dev/null
+++ b/compiler/one-cmds/tests/one-infer_003.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  rm -rf ../bin/dummy-infer
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="sample.tvn"
+
+if [[ ! -s "${inputfile}" ]]; then
+  touch ${inputfile}
+fi
+
+# copy dummy-infer to bin folder
+cp dummy-infer ../bin/dummy-infer
+
+# run test
+one-infer -b dummy -- ${inputfile} > ${filename}.log
+
+rm -rf ../bin/dummy-infer
+
+if grep -q "dummy-infer dummy output!!!" "${filename}.log"; then
+  echo "${filename_ext} SUCCESS"
+  exit 0
+fi
+
+trap_err_onexit
diff --git a/compiler/one-cmds/tests/one-infer_004.test b/compiler/one-cmds/tests/one-infer_004.test
new file mode 100644
index 000000000..a4cb76c55
--- /dev/null
+++ b/compiler/one-cmds/tests/one-infer_004.test
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# print one-infer's help message
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+# run test
+one-infer -h > ${filename}.log
+
+if grep -q "command line tool to infer model" "${filename}.log"; then
+  echo "${filename_ext} SUCCESS"
+  exit 0
+fi
+
+trap_err_onexit
diff --git a/compiler/one-cmds/tests/one-infer_005.cfg b/compiler/one-cmds/tests/one-infer_005.cfg
new file mode 100644
index 000000000..aca687801
--- /dev/null
+++ b/compiler/one-cmds/tests/one-infer_005.cfg
@@ -0,0 +1,3 @@
+[one-infer]
+backend=dummy
+command=sample.tvn
diff --git a/compiler/one-cmds/tests/one-infer_005.test b/compiler/one-cmds/tests/one-infer_005.test
new file mode 100644
index 000000000..a44dd0e25
--- /dev/null
+++ b/compiler/one-cmds/tests/one-infer_005.test
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# one-infer with configuration input
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  rm -rf ../bin/dummy-infer
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="one-infer_005.cfg"
+inputfile="sample.tvn"
+
+if [[ ! -s "${inputfile}" ]]; then
+  touch ${inputfile}
+fi
+
+# copy dummy-infer to bin folder
+cp dummy-infer ../bin/dummy-infer
+
+# run test
+one-infer -C ${configfile} > ${filename}.log
+
+rm -rf ../bin/dummy-infer
+
+if grep -q "dummy-infer dummy output!!!" "${filename}.log"; then
+  echo "${filename_ext} SUCCESS"
+  exit 0
+fi
+
+trap_err_onexit
diff --git a/compiler/one-cmds/tests/one-infer_006.test b/compiler/one-cmds/tests/one-infer_006.test
new file mode 100644
index 000000000..2612133a3
--- /dev/null
+++ b/compiler/one-cmds/tests/one-infer_006.test
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# one-infer with post process script
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  rm -rf ../bin/dummy-infer
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="sample.tvn"
+
+if [[ ! -s "${inputfile}" ]]; then
+  touch ${inputfile}
+fi
+
+# copy dummy-infer to bin folder
+cp dummy-infer ../bin/dummy-infer
+
+# run test
+one-infer -b dummy --post-process "./one-infer-test-post-process.py TOKEN" -- ${inputfile} > ${filename}.log 2>&1
+return_code=$?
+
+rm -rf ../bin/dummy-infer
+
+if grep -q "dummy-infer dummy output!!!" "${filename}.log"; then
+  if [ "$return_code" -eq "0" ]; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+fi
+
+trap_err_onexit
diff --git a/compiler/one-cmds/tests/one-infer_neg_001.test b/compiler/one-cmds/tests/one-infer_neg_001.test
new file mode 100644
index 000000000..62e721128
--- /dev/null
+++ b/compiler/one-cmds/tests/one-infer_neg_001.test
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with no input
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "error: the following arguments are required: {-d/--driver | -b/--backend}" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+# run test
+one-infer > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-infer_neg_002.test b/compiler/one-cmds/tests/one-infer_neg_002.test
new file mode 100644
index 000000000..fa88876e8
--- /dev/null
+++ b/compiler/one-cmds/tests/one-infer_neg_002.test
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# passed driver is not found
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+driver_name="neg-infer"
+
+trap_err_onexit()
+{
+  if grep -q "FileNotFoundError: ${driver_name} not found" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+# run test
+one-infer -d ${driver_name} -- -h> ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-infer_neg_003.test b/compiler/one-cmds/tests/one-infer_neg_003.test
new file mode 100644
index 000000000..a0005520f
--- /dev/null
+++ b/compiler/one-cmds/tests/one-infer_neg_003.test
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# passed backend is not found
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+backend_name="neg"
+
+trap_err_onexit()
+{
+  if grep -q "FileNotFoundError: ${backend_name}-infer not found" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+# run test
+one-infer -b ${backend_name} -- -h> ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-infer_neg_004.test b/compiler/one-cmds/tests/one-infer_neg_004.test
new file mode 100644
index 000000000..b9130d051
--- /dev/null
+++ b/compiler/one-cmds/tests/one-infer_neg_004.test
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# both the -d (driver) and -b (backend) options are given as arguments
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+backend_name="neg"
+driver_name="neg2"
+
+trap_err_onexit()
+{
+  if grep -q "\-d and -b options are mutually exclusive. Please use only one of them" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+# run test
+one-infer -d ${driver_name} -b ${backend_name} -- -h> ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-infer_neg_005.test b/compiler/one-cmds/tests/one-infer_neg_005.test
new file mode 100644
index 000000000..9074debcf
--- /dev/null
+++ b/compiler/one-cmds/tests/one-infer_neg_005.test
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# one-infer with invalid post process script
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  return_code=$?
+  if grep -q "dummy-infer dummy output!!!" "${filename}.log"; then
+    # Case where the inference driver succeeded but an error occurred afterwards
+    if [ "$return_code" -ne "0" ]; then
+      echo "${filename_ext} SUCCESS"
+      exit 0
+    fi
+  fi
+
+  echo "${filename_ext} FAILED"
+  rm -rf ../bin/dummy-infer
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="sample.tvn"
+
+if [[ ! -s "${inputfile}" ]]; then
+  touch ${inputfile}
+fi
+
+# copy dummy-infer to bin folder
+cp dummy-infer ../bin/dummy-infer
+
+# run test
+one-infer -b dummy --post-process "./one-infer-test-post-process.py" -- ${inputfile} > ${filename}.log 2>&1
+
+rm -rf ../bin/dummy-infer
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-optimize_001.test b/compiler/one-cmds/tests/one-optimize_001.test
index 8eb58f4eb..4152fa3dd 100644
--- a/compiler/one-cmds/tests/one-optimize_001.test
+++ b/compiler/one-cmds/tests/one-optimize_001.test
@@ -40,7 +40,7 @@ if [[ ! -s ${inputfile} ]]; then
 fi
 
 # run test
-one-optimize --O1 \
+one-optimize --resolve_customop_add \
 --input_path ${inputfile} \
 --output_path ${outputfile} > /dev/null 2>&1
 
diff --git a/compiler/one-cmds/tests/one-optimize_002.test b/compiler/one-cmds/tests/one-optimize_002.test
index bd64494be..58f792bf8 100644
--- a/compiler/one-cmds/tests/one-optimize_002.test
+++ b/compiler/one-cmds/tests/one-optimize_002.test
@@ -40,7 +40,7 @@ if [[ ! -s ${inputfile} ]]; then
 fi
 
 # run test
-one-optimize --O1 \
+one-optimize --resolve_customop_add \
 --change_outputs InceptionV3/Logits/SpatialSqueeze1 \
 --input_path ${inputfile} \
 --output_path ${outputfile} > /dev/null 2>&1
diff --git a/compiler/one-cmds/tests/one-optimize_neg_001.test b/compiler/one-cmds/tests/one-optimize_neg_001.test
index f0b5563c7..c67e3d489 100644
--- a/compiler/one-cmds/tests/one-optimize_neg_001.test
+++ b/compiler/one-cmds/tests/one-optimize_neg_001.test
@@ -39,7 +39,7 @@ rm -rf ${outputfile}
 rm -rf ${outputfile}.log
 
 # run test
-one-optimize --O1 \
+one-optimize --resolve_customop_add \
 --input_path ${inputfile} \
 --output_path ${outputfile} > ${filename}.log 2>&1
 
diff --git a/compiler/one-cmds/tests/one-optimize_neg_002.test b/compiler/one-cmds/tests/one-optimize_neg_002.test
index 72f306e20..a1ef70216 100644
--- a/compiler/one-cmds/tests/one-optimize_neg_002.test
+++ b/compiler/one-cmds/tests/one-optimize_neg_002.test
@@ -39,7 +39,7 @@ rm -rf ${outputfile}
 rm -rf ${outputfile}.log
 
 # run test
-one-optimize --O1 \
+one-optimize --resolve_customop_add \
 --input_path ${inputfile} \
 --output_path ${outputfile} > ${filename}.log 2>&1
 
diff --git a/compiler/one-cmds/tests/one-optimize_neg_003.test b/compiler/one-cmds/tests/one-optimize_neg_003.test
index 3fe7d330e..668a6c29d 100644
--- a/compiler/one-cmds/tests/one-optimize_neg_003.test
+++ b/compiler/one-cmds/tests/one-optimize_neg_003.test
@@ -44,7 +44,7 @@ if [[ ! -s ${inputfile} ]]; then
 fi
 
 # run test
-one-optimize --O1 \
+one-optimize --resolve_customop_add \
 --input_path "${inputfile}" > "${filename}.log" 2>&1
 
 echo "${filename_ext} FAILED"
diff --git a/compiler/one-cmds/tests/one-optimize_neg_004.test b/compiler/one-cmds/tests/one-optimize_neg_004.test
index e73911b54..5abd4c553 100644
--- a/compiler/one-cmds/tests/one-optimize_neg_004.test
+++ b/compiler/one-cmds/tests/one-optimize_neg_004.test
@@ -39,7 +39,7 @@ rm -rf ${outputfile}
 rm -rf ${filename}.log
 
 # run test
-one-optimize --O1 \
+one-optimize --resolve_customop_add \
 --change_outputs non_existing_node_name \
 --input_path ${inputfile} \
 --output_path ${outputfile} > ${filename}.log 2>&1
diff --git a/compiler/one-cmds/tests/one-partition_001.test b/compiler/one-cmds/tests/one-partition_001.test
new file mode 100644
index 000000000..a6fba07d7
--- /dev/null
+++ b/compiler/one-cmds/tests/one-partition_001.test
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+testmodel="Net_InstanceNorm_003"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="${testmodel}.circle"
+partfile="${testmodel}.part"
+outputfile="${testmodel}.conn.json"
+
+rm -rf  ${testmodel}.000*
+rm -rf  ${testmodel}.conn.*
+rm -rf  ${testmodel}.*.log
+
+# run test
+one-partition \
+--input_file ${inputfile} \
+--part_file ${partfile} > /dev/null 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-partition_neg_001.test b/compiler/one-cmds/tests/one-partition_neg_001.test
new file mode 100644
index 000000000..d54a94fa2
--- /dev/null
+++ b/compiler/one-cmds/tests/one-partition_neg_001.test
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid .part file (wrong comply value)
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+testmodel="Net_InstanceNorm_003"
+
+trap_err_onexit()  # ERR-trap handler for this negative test
+{
+  if grep -q "ERROR" "${filename}.log"; then  # the expected error text was logged
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"  # a command failed but "ERROR" is not in the log
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="${testmodel}.circle"
+partfile="${testmodel}.neg.part"
+outputfile="${testmodel}.conn.json"
+
+rm -rf  ${testmodel}.000*
+rm -rf  ${testmodel}.conn.*
+rm -rf  ${testmodel}.*.log
+rm -rf ${filename}.log
+
+# run test
+one-partition \
+--input_file ${inputfile} \
+--part_file ${partfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-partition_neg_002.test b/compiler/one-cmds/tests/one-partition_neg_002.test
new file mode 100644
index 000000000..23fe84c05
--- /dev/null
+++ b/compiler/one-cmds/tests/one-partition_neg_002.test
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid .cfg file (no one-partition section)
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+testmodel="Net_InstanceNorm_003"
+
+trap_err_onexit()
+{
+  if grep -q "'one-partition' section" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+cfgfile="${testmodel}.neg.cfg"
+
+rm -rf  ${testmodel}.000*
+rm -rf  ${testmodel}.conn.*
+rm -rf  ${testmodel}.*.log
+rm -rf ${filename}.log
+
+# run test
+one-partition -C ${cfgfile}> ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-quantize_010.test b/compiler/one-cmds/tests/one-quantize_010.test
new file mode 100644
index 000000000..1095ba0a0
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_010.test
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+check_message()  # decide the test result from the captured log
+{
+  if grep -q "MPEIR for InceptionV3/Predictions/Reshape_1 is" "${filename}.log"; then  # MPEIR line was printed
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  trap_err_onexit  # message missing: report FAILED and exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+outputfile="./inception_v3.one-quantize_010.q.circle"
+datafile="./inception_v3_test_data.h5"
+
+rm -rf ${outputfile}
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+  /bin/bash one-import_001.test > /dev/null 2>&1
+  return_code=$?
+  if [[ ${return_code} != 0 ]]; then
+    trap_err_onexit
+  fi
+fi
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--granularity channel \
+--input_path ${inputfile} \
+--input_data ${datafile} \
+--output_path ${outputfile} \
+--evaluate_result \
+--test_data ${datafile} \
+--print_mpeir > ${filename}.log 2>&1
+
+check_message
diff --git a/compiler/one-cmds/tests/one-quantize_011.test b/compiler/one-cmds/tests/one-quantize_011.test
new file mode 100644
index 000000000..34d7f57b5
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_011.test
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+check_message()
+{
+  if grep -q "Mean Top-5 match ratio for InceptionV3/Predictions/Reshape_1 is" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  trap_err_onexit
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+outputfile="./inception_v3.one-quantize_011.q.circle"
+datafile="./inception_v3_test_data.h5"
+
+rm -rf ${outputfile}
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--granularity channel \
+--input_path ${inputfile} \
+--input_data ${datafile} \
+--output_path ${outputfile} \
+--evaluate_result \
+--test_data ${datafile} \
+--print_top5_match > ${filename}.log 2>&1
+
+check_message
diff --git a/compiler/one-cmds/tests/one-quantize_012.qconf.json b/compiler/one-cmds/tests/one-quantize_012.qconf.json
new file mode 100644
index 000000000..4a15b04f5
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_012.qconf.json
@@ -0,0 +1,16 @@
+{
+    "default_quantization_dtype" : "uint8",
+    "default_granularity" : "channel",
+    "layers" : [
+        {
+            "names" : ["InceptionV3/InceptionV3/Conv2d_2b_3x3/Relu;InceptionV3/InceptionV3/Conv2d_2b_3x3/BatchNorm/FusedBatchNorm;InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0a_1x1/Conv2D;InceptionV3/InceptionV3/Conv2d_2b_3x3/Conv2D",
+            "InceptionV3/InceptionV3/MaxPool_5a_3x3/MaxPool",
+            "InceptionV3/InceptionV3/Mixed_5b/concat",
+            "InceptionV3/InceptionV3/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool",
+            "InceptionV3/InceptionV3/Mixed_7c/concat",
+            "InceptionV3/Predictions/Reshape_1"],
+            "dtype" : "int16",
+            "granularity" : "channel"
+        }
+    ]
+}
diff --git a/compiler/one-cmds/tests/one-quantize_012.test b/compiler/one-cmds/tests/one-quantize_012.test
new file mode 100644
index 000000000..fba18acc5
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_012.test
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+outputfile="./inception_v3.one-quantize_012.q.circle"
+
+rm -rf ${outputfile}
+
+# run test without input data
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--granularity channel \
+--quant_config one-quantize_012.qconf.json \
+--input_path ${inputfile} \
+--output_path ${outputfile} > /dev/null 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-quantize_013.qconf.json b/compiler/one-cmds/tests/one-quantize_013.qconf.json
new file mode 100644
index 000000000..4a15b04f5
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_013.qconf.json
@@ -0,0 +1,16 @@
+{
+    "default_quantization_dtype" : "uint8",
+    "default_granularity" : "channel",
+    "layers" : [
+        {
+            "names" : ["InceptionV3/InceptionV3/Conv2d_2b_3x3/Relu;InceptionV3/InceptionV3/Conv2d_2b_3x3/BatchNorm/FusedBatchNorm;InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0a_1x1/Conv2D;InceptionV3/InceptionV3/Conv2d_2b_3x3/Conv2D",
+            "InceptionV3/InceptionV3/MaxPool_5a_3x3/MaxPool",
+            "InceptionV3/InceptionV3/Mixed_5b/concat",
+            "InceptionV3/InceptionV3/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool",
+            "InceptionV3/InceptionV3/Mixed_7c/concat",
+            "InceptionV3/Predictions/Reshape_1"],
+            "dtype" : "int16",
+            "granularity" : "channel"
+        }
+    ]
+}
diff --git a/compiler/one-cmds/tests/one-quantize_013.test b/compiler/one-cmds/tests/one-quantize_013.test
new file mode 100644
index 000000000..fd443d627
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_013.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# quantized_dtype and granularity are given by qconfig file
+# (not by command line interface)
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+outputfile="./inception_v3.one-quantize_013.q.circle"
+
+rm -rf ${outputfile}
+
+# run test without input data
+# quantized_dtype and granularity are not given here
+one-quantize \
+--input_dtype float32 \
+--quant_config one-quantize_013.qconf.json \
+--input_path ${inputfile} \
+--output_path ${outputfile} > /dev/null 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-quantize_014.test b/compiler/one-cmds/tests/one-quantize_014.test
new file mode 100644
index 000000000..518c32841
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_014.test
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Test if `circle-eval-diff` supports directory input.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+check_message()
+{
+  if grep -q "Mean Top-5 match ratio for InceptionV3/Predictions/Reshape_1 is" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  trap_err_onexit
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+outputfile="./inception_v3.one-quantize_014.q.circle"
+datadir="./raw_files/"
+
+rm -rf ${outputfile}
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--granularity channel \
+--input_path ${inputfile} \
+--input_data ${datadir} \
+--input_data_format dir \
+--output_path ${outputfile} \
+--evaluate_result \
+--test_data ${datadir} \
+--print_top5_match > ${filename}.log 2>&1
+
+check_message
diff --git a/compiler/one-cmds/tests/one-quantize_015.test b/compiler/one-cmds/tests/one-quantize_015.test
new file mode 100644
index 000000000..bb45b5722
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_015.test
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Test if --fake_quantize option works well
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.mat.q8.circle"
+outputfile="./inception_v3.one-quantize_015.fq.circle"
+
+rm -rf ${outputfile}
+
+# run test
+one-quantize \
+--fake_quantize \
+--input_path ${inputfile} \
+--output_path ${outputfile} > ${filename}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-quantize_neg_019.test b/compiler/one-cmds/tests/one-quantize_neg_019.test
index ac920a4fe..e182edf78 100644
--- a/compiler/one-cmds/tests/one-quantize_neg_019.test
+++ b/compiler/one-cmds/tests/one-quantize_neg_019.test
@@ -42,7 +42,7 @@ one-quantize \
 --input_dtype float32 \
 --quantized_dtype int16 \
 --granularity channel \
---input_type float32 \
+--input_type float64 \
 --input_path ${inputfile} \
 --output_path ${outputfile} > ${filename}.log 2>&1
 
diff --git a/compiler/one-cmds/tests/one-quantize_neg_020.test b/compiler/one-cmds/tests/one-quantize_neg_020.test
new file mode 100644
index 000000000..27b11c3e6
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_020.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# check error message is printed when qconfig file is not json
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Failed to decode" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+outputfile="./inception_v3.quantized.neg_020.circle"
+
+rm -rf ${filename}.log  # drop the stale log that trap_err_onexit greps
+
+# run test
+one-quantize \
+--input_dtype float32 \
+--quant_config one-quantize_neg_020.test \
+--input_path ${inputfile} \
+--output_path ${outputfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_008.cfg b/compiler/one-cmds/tests/onecc_008.cfg
index 0be026e6e..020e274e1 100644
--- a/compiler/one-cmds/tests/onecc_008.cfg
+++ b/compiler/one-cmds/tests/onecc_008.cfg
@@ -15,7 +15,6 @@ output_path=test_onnx_model.circle
 [one-optimize]
 input_path=test_onnx_model.circle
 output_path=test_onnx_model.opt.circle
-all=True
 remove_redundant_transpose=True
 
 [one-codegen]
diff --git a/compiler/one-cmds/tests/onecc_009.cfg b/compiler/one-cmds/tests/onecc_009.cfg
index a17ae59cb..86121c557 100644
--- a/compiler/one-cmds/tests/onecc_009.cfg
+++ b/compiler/one-cmds/tests/onecc_009.cfg
@@ -15,7 +15,6 @@ output_path=onnx_conv2d_conv2d.circle
 [one-optimize]
 input_path=onnx_conv2d_conv2d.circle
 output_path=onnx_conv2d_conv2d.opt.circle
-all=True
 remove_redundant_transpose=True
 convert_nchw_to_nhwc=True
 
diff --git a/compiler/one-cmds/tests/onecc_024.cfg b/compiler/one-cmds/tests/onecc_024.cfg
new file mode 100644
index 000000000..7b4b1a80a
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_024.cfg
@@ -0,0 +1,22 @@
+[onecc]
+one-import-tf=True
+one-import-tflite=False
+one-import-bcq=False
+one-import-onnx=False
+one-optimize=True
+one-quantize=False
+one-pack=False
+one-codegen=False
+
+[one-import-tf]
+input_path=inception_v3.pb
+output_path=inception_v3.circle
+input_arrays=input
+input_shapes=1,299,299,3
+output_arrays=InceptionV3/Predictions/Reshape_1
+converter_version=v1
+
+[one-optimize]
+input_path=inception_v3.circle
+output_path=inception_v3.opt.circle
+make_batchnorm_gamma_positive=False
diff --git a/compiler/one-cmds/tests/onecc_024.test b/compiler/one-cmds/tests/onecc_024.test
new file mode 100644
index 000000000..1f5daa13e
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_024.test
@@ -0,0 +1,77 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Use `OONECC_024` optimization option
+
+: '
+This test assumes below directories.
+
+[one hierarchy]
+    one
+    ├── backends
+    ├── bin
+    ├── doc
+    ├── include
+    ├── lib
+    ├── optimization
+    └── test # pwd
+'
+
+OPT_ALREADY_EXIST=true
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+clean_envir()  # undo what this test adds under ../optimization
+{
+  rm -rf ../optimization/OONECC_024.cfg  # option file copied in by this test
+  if [ "$OPT_ALREADY_EXIST" = false ]; then  # false only when this test had to mkdir the directory
+    rm -rf ../optimization
+  fi
+}
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  clean_envir
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="onecc_024.cfg"
+outputfile="inception_v3.opt.circle"
+
+rm -rf ${outputfile}
+
+if [ ! -d "../optimization" ]; then
+  mkdir -p ../optimization
+  OPT_ALREADY_EXIST=false
+fi
+
+cp OONECC_024.cfg ../optimization
+
+# run test
+LUCI_LOG=5 onecc -C ${configfile} -OONECC_024 > ${filename}.log 2>&1
+
+clean_envir
+
+if grep -q "MakeBatchNormGammaPositivePass" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+fi
+
+trap_err_onexit
diff --git a/compiler/one-cmds/tests/onecc_025.cfg b/compiler/one-cmds/tests/onecc_025.cfg
new file mode 100644
index 000000000..4776ea80e
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_025.cfg
@@ -0,0 +1,20 @@
+[onecc]
+one-import-tf=True
+one-import-tflite=False
+one-import-bcq=False
+one-optimize=True
+one-quantize=False
+one-pack=False
+one-codegen=False
+
+[one-import-tf]
+input_path=inception_v3.pb
+output_path=inception_v3.circle
+input_arrays=input
+input_shapes=1,299,299,3
+output_arrays=InceptionV3/Predictions/Reshape_1
+converter_version=v2
+
+[one-optimize]
+input_path=inception_v3.circle
+output_path=inception_v3.opt.circle
diff --git a/compiler/one-cmds/tests/onecc_025.test b/compiler/one-cmds/tests/onecc_025.test
new file mode 100644
index 000000000..396f40cea
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_025.test
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# one-import-tf -> one-optimize with the configuration file that includes `onecc` section
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="onecc_025.cfg"  # use the config that ships with this test, not onecc_001.cfg
+outputfile="inception_v3.opt.circle"
+
+# run test
+onecc -C ${configfile} > /dev/null 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/onecc_026.cfg b/compiler/one-cmds/tests/onecc_026.cfg
new file mode 100644
index 000000000..c27a13654
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_026.cfg
@@ -0,0 +1,16 @@
+[onecc]
+one-import-tf=False
+one-import-tflite=False
+one-import-bcq=False
+one-optimize=False
+one-quantize=True
+one-pack=False
+one-codegen=False
+
+[one-quantize]
+input_path=inception_v3.circle
+output_path=inception_v3.onecc_026.q.circle
+input_data=inception_v3_test_data.h5
+evaluate_result=True
+test_data=inception_v3_test_data.h5
+print_mpeir=True
diff --git a/compiler/one-cmds/tests/onecc_026.test b/compiler/one-cmds/tests/onecc_026.test
new file mode 100644
index 000000000..84cfa4146
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_026.test
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+check_message()
+{
+  if grep -q "MPEIR for InceptionV3/Predictions/Reshape_1 is" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  trap_err_onexit
+}
+
+trap trap_err_onexit ERR
+
+configfile="onecc_026.cfg"
+outputfile="inception_v3.onecc_026.q.circle"
+
+rm -rf ${outputfile}
+
+# run test
+onecc -C ${configfile} > ${filename}.log 2>&1
+
+check_message
diff --git a/compiler/one-cmds/tests/onecc_027.cfg b/compiler/one-cmds/tests/onecc_027.cfg
new file mode 100644
index 000000000..d3f6b5e82
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_027.cfg
@@ -0,0 +1,15 @@
+[onecc]
+one-import-tf=False
+one-import-tflite=False
+one-import-bcq=False
+one-import-onnx=False
+one-optimize=False
+one-quantize=False
+one-pack=False
+one-codegen=False
+one-profile=False
+one-infer=True
+
+[one-infer]
+backend=dummy
+command=test_onnx_model.bin
diff --git a/compiler/one-cmds/tests/onecc_027.test b/compiler/one-cmds/tests/onecc_027.test
new file mode 100644
index 000000000..e727359ef
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_027.test
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# one-infer
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()  # ERR-trap / explicit failure handler
+{
+  echo "${filename_ext} FAILED"
+  rm -rf ../bin/dummy-infer  # remove the dummy-infer helper this test copies into ../bin
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="onecc_027.cfg"
+
+# copy dummy-infer to bin folder
+cp dummy-infer ../bin/dummy-infer
+
+# run test
+onecc -C ${configfile} > ${filename}.log
+
+rm -rf ../bin/dummy-infer
+
+if grep -q "dummy-infer dummy output!!!" "${filename}.log"; then
+  echo "${filename_ext} SUCCESS"
+  exit 0
+fi
+
+trap_err_onexit
diff --git a/compiler/one-cmds/tests/onecc_028.test b/compiler/one-cmds/tests/onecc_028.test
new file mode 100644
index 000000000..10ce1583b
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_028.test
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run a workflow where one-import-tf -> one-optimize -> one-pack
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_028.workflow.json"
+outputfile="inception_v3_pkg"
+
+rm -rf ${outputfile}
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/onecc_028.workflow.json b/compiler/one-cmds/tests/onecc_028.workflow.json
new file mode 100644
index 000000000..84bfd01fa
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_028.workflow.json
@@ -0,0 +1,37 @@
+{
+    "workflows": [
+        "MY_WORKFLOW"
+    ],
+    "MY_WORKFLOW": {
+        "steps": [
+            "IMPORT_TF",
+            "OPTIMIZE",
+            "PACK"
+        ],
+        "IMPORT_TF": {
+            "one-cmd": "one-import-tf",
+            "commands": {
+                "input_path": "inception_v3.pb",
+                "output_path": "inception_v3.circle",
+                "input_arrays": "input",
+                "input_shapes": "1,299,299,3",
+                "output_arrays": "InceptionV3/Predictions/Reshape_1",
+                "converter_version": "v2"
+            }
+        },
+        "OPTIMIZE": {
+            "one-cmd": "one-optimize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.opt.circle"
+            }
+        },
+        "PACK": {
+            "one-cmd": "one-pack",
+            "commands": {
+                "input_path": "inception_v3.opt.circle",
+                "output_path": "inception_v3_pkg"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_029.test b/compiler/one-cmds/tests/onecc_029.test
new file mode 100644
index 000000000..9bab1a1ee
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_029.test
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run a workflow where one-import-tf -> one-quantize
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_029.workflow.json"
+outputfile="inception_v3.quantized.circle"
+
+rm -rf ${outputfile}
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/onecc_029.workflow.json b/compiler/one-cmds/tests/onecc_029.workflow.json
new file mode 100644
index 000000000..65c9ea662
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_029.workflow.json
@@ -0,0 +1,30 @@
+{
+    "workflows": [
+        "QUANTIZE_WORKFLOW"
+    ],
+    "QUANTIZE_WORKFLOW": {
+        "steps": [
+            "IMPORT_TF",
+            "QUANTIZE"
+        ],
+        "IMPORT_TF": {
+            "one-cmd": "one-import-tf",
+            "commands": {
+                "input_path": "inception_v3.pb",
+                "output_path": "inception_v3.circle",
+                "input_arrays": "input",
+                "input_shapes": "1,299,299,3",
+                "output_arrays": "InceptionV3/Predictions/Reshape_1",
+                "converter_version": "v2"
+            }
+        },
+        "QUANTIZE": {
+            "one-cmd": "one-quantize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.quantized.circle",
+                "input_data": "inception_v3_test_data.h5"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_030.test b/compiler/one-cmds/tests/onecc_030.test
new file mode 100644
index 000000000..c0aa56a51
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_030.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run a workflow where one-import-tf -> one-codegen
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  rm -rf ../bin/dummy-compile
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_030.workflow.json"
+outputfile="sample.tvn"
+
+rm -rf ${outputfile}
+
+# copy dummy-compile to bin folder
+cp dummy-compile ../bin/dummy-compile
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+rm -rf ../bin/dummy-compile
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/onecc_030.workflow.json b/compiler/one-cmds/tests/onecc_030.workflow.json
new file mode 100644
index 000000000..111a1b034
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_030.workflow.json
@@ -0,0 +1,29 @@
+{
+    "workflows": [
+        "codegen_wf"
+    ],
+    "codegen_wf": {
+        "steps": [
+            "import_tf",
+            "codegen"
+        ],
+        "import_tf": {
+            "one-cmd": "one-import-tf",
+            "commands": {
+                "input_path": "inception_v3.pb",
+                "output_path": "inception_v3.circle",
+                "input_arrays": "input",
+                "input_shapes": "1,299,299,3",
+                "output_arrays": "InceptionV3/Predictions/Reshape_1",
+                "converter_version": "v2"
+            }
+        },
+        "codegen": {
+            "one-cmd": "one-codegen",
+            "commands": {
+                "backend": "dummy",
+                "command": "-o sample.tvn inception_v3.circle"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_031.test b/compiler/one-cmds/tests/onecc_031.test
new file mode 100644
index 000000000..7a1c670c8
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_031.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run a workflow where one-import-tflite -> one-optimize -> one-codegen
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  rm -rf ../bin/dummy-compile
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_031.workflow.json"
+outputfile="sample.tvn"
+
+rm -rf ${outputfile}
+
+# copy dummy-compile to bin folder
+cp dummy-compile ../bin/dummy-compile
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+rm -rf ../bin/dummy-compile
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/onecc_031.workflow.json b/compiler/one-cmds/tests/onecc_031.workflow.json
new file mode 100644
index 000000000..83d52b942
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_031.workflow.json
@@ -0,0 +1,33 @@
+{
+    "workflows": [
+        "wf"
+    ],
+    "wf": {
+        "steps": [
+            "import",
+            "optimize",
+            "codegen"
+        ],
+        "import": {
+            "one-cmd": "one-import-tflite",
+            "commands": {
+                "input_path": "inception_v3.tflite",
+                "output_path": "inception_v3.circle"
+            }
+        },
+        "optimize": {
+            "one-cmd": "one-optimize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.opt.circle"
+            }
+        },
+        "codegen": {
+            "one-cmd": "one-codegen",
+            "commands": {
+                "backend": "dummy",
+                "command": "-o sample.tvn inception_v3.opt.circle"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_032.test b/compiler/one-cmds/tests/onecc_032.test
new file mode 100644
index 000000000..89b6c41a5
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_032.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run a workflow where one-import-tflite -> one-optimize -> one-quantize -> one-codegen
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  rm -rf ../bin/dummy-compile
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_032.workflow.json"
+outputfile="sample.tvn"
+
+rm -rf ${outputfile}
+
+# copy dummy-compile to bin folder
+cp dummy-compile ../bin/dummy-compile
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+rm -rf ../bin/dummy-compile
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/onecc_032.workflow.json b/compiler/one-cmds/tests/onecc_032.workflow.json
new file mode 100644
index 000000000..08d3f0f5c
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_032.workflow.json
@@ -0,0 +1,42 @@
+{
+    "workflows": [
+        "wf"
+    ],
+    "wf": {
+        "steps": [
+            "import",
+            "optimize",
+            "quantize",
+            "codegen"
+        ],
+        "import": {
+            "one-cmd": "one-import-tflite",
+            "commands": {
+                "input_path": "inception_v3.tflite",
+                "output_path": "inception_v3.circle"
+            }
+        },
+        "optimize": {
+            "one-cmd": "one-optimize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.opt.circle"
+            }
+        },
+        "quantize": {
+            "one-cmd": "one-quantize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.quantized.circle",
+                "input_data": "inception_v3_test_data.h5"
+            }
+        },
+        "codegen": {
+            "one-cmd": "one-codegen",
+            "commands": {
+                "backend": "dummy",
+                "command": "-o sample.tvn inception_v3.quantized.circle"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_033.test b/compiler/one-cmds/tests/onecc_033.test
new file mode 100644
index 000000000..635582f61
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_033.test
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run a workflow where one-import-tflite -> one-optimize -> one-quantize -> one-pack
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_033.workflow.json"
+outputfile="inception_v3_pkg"
+
+rm -rf ${outputfile}
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/onecc_033.workflow.json b/compiler/one-cmds/tests/onecc_033.workflow.json
new file mode 100644
index 000000000..01233ffd9
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_033.workflow.json
@@ -0,0 +1,42 @@
+{
+    "workflows": [
+        "wf"
+    ],
+    "wf": {
+        "steps": [
+            "import",
+            "optimize",
+            "quantize",
+            "pack"
+        ],
+        "import": {
+            "one-cmd": "one-import-tflite",
+            "commands": {
+                "input_path": "inception_v3.tflite",
+                "output_path": "inception_v3.circle"
+            }
+        },
+        "optimize": {
+            "one-cmd": "one-optimize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.opt.circle"
+            }
+        },
+        "quantize": {
+            "one-cmd": "one-quantize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.quantized.circle",
+                "input_data": "inception_v3_test_data.h5"
+            }
+        },
+        "pack": {
+            "one-cmd": "one-pack",
+            "commands": {
+                "input_path": "inception_v3.quantized.circle",
+                "output_path": "inception_v3_pkg"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_034.test b/compiler/one-cmds/tests/onecc_034.test
new file mode 100644
index 000000000..e76654809
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_034.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run a workflow where one-import-onnx -> one-optimize -> one-codegen
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  rm -rf ../bin/dummy-compile
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_034.workflow.json"
+outputfile="onnx_conv2d_conv2d.bin"
+
+rm -rf ${outputfile}
+
+# copy dummy-compile to bin folder
+cp dummy-compile ../bin/dummy-compile
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+rm -rf ../bin/dummy-compile
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/onecc_034.workflow.json b/compiler/one-cmds/tests/onecc_034.workflow.json
new file mode 100644
index 000000000..bc3cbbf58
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_034.workflow.json
@@ -0,0 +1,35 @@
+{
+    "workflows": [
+        "wf"
+    ],
+    "wf": {
+        "steps": [
+            "import",
+            "optimize",
+            "codegen"
+        ],
+        "import": {
+            "one-cmd": "one-import-onnx",
+            "commands": {
+                "input_path": "onnx_conv2d_conv2d.onnx",
+                "output_path": "onnx_conv2d_conv2d.circle"
+            }
+        },
+        "optimize": {
+            "one-cmd": "one-optimize",
+            "commands": {
+                "input_path": "onnx_conv2d_conv2d.circle",
+                "output_path": "onnx_conv2d_conv2d.opt.circle",
+                "remove_redundant_transpose": "True",
+                "convert_nchw_to_nhwc": "True"
+            }
+        },
+        "codegen": {
+            "one-cmd": "one-codegen",
+            "commands": {
+                "backend": "dummy",
+                "command": "-o onnx_conv2d_conv2d.bin onnx_conv2d_conv2d.opt.circle"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_035.test b/compiler/one-cmds/tests/onecc_035.test
new file mode 100644
index 000000000..762cdd31a
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_035.test
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run a workflow where one-import-tf generates intermediate files
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_035.workflow.json"
+outputfile="inception_v3.alt.circle"
+intermfile="inception_v3.alt.tflite"
+
+rm -rf ${outputfile}
+rm -rf ${intermfile}
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+if [[ ! -s "${intermfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/onecc_035.workflow.json b/compiler/one-cmds/tests/onecc_035.workflow.json
new file mode 100644
index 000000000..6abf1f32b
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_035.workflow.json
@@ -0,0 +1,22 @@
+{
+    "workflows": [
+        "wf"
+    ],
+    "wf": {
+        "steps": [
+            "import"
+        ],
+        "import": {
+            "one-cmd": "one-import-tf",
+            "commands": {
+                "input_path": "inception_v3.pb",
+                "output_path": "inception_v3.alt.circle",
+                "input_arrays": "input",
+                "input_shapes": "1,299,299,3",
+                "output_arrays": "InceptionV3/Predictions/Reshape_1",
+                "converter_version": "v1",
+                "save_intermediate": "True"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_036.test b/compiler/one-cmds/tests/onecc_036.test
new file mode 100644
index 000000000..865255e9f
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_036.test
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run a workflow where one-import-onnx generates intermediate files
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_036.workflow.json"
+outputfile="test_onnx_model.circle"
+intermfile="test_onnx_model.tflite"
+
+rm -rf ${outputfile}
+rm -rf ${intermfile}
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+if [[ ! -s "${intermfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/onecc_036.workflow.json b/compiler/one-cmds/tests/onecc_036.workflow.json
new file mode 100644
index 000000000..5fa29edb5
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_036.workflow.json
@@ -0,0 +1,18 @@
+{
+    "workflows": [
+        "wf"
+    ],
+    "wf": {
+        "steps": [
+            "import"
+        ],
+        "import": {
+            "one-cmd": "one-import-onnx",
+            "commands": {
+                "input_path": "test_onnx_model.onnx",
+                "output_path": "test_onnx_model.circle",
+                "save_intermediate": "True"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_037.test b/compiler/one-cmds/tests/onecc_037.test
new file mode 100644
index 000000000..52ea9e4c7
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_037.test
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run a workflow where one-import-tf -> one-optimize
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_037.workflow.json"
+outputfile="inception_v3.opt.circle"
+
+rm -rf ${outputfile}
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/onecc_037.workflow.json b/compiler/one-cmds/tests/onecc_037.workflow.json
new file mode 100644
index 000000000..3317fb27a
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_037.workflow.json
@@ -0,0 +1,29 @@
+{
+    "workflows": [
+        "SIMPLE_WORKFLOW"
+    ],
+    "SIMPLE_WORKFLOW": {
+        "steps": [
+            "IMPORT",
+            "OPTIMIZE"
+        ],
+        "IMPORT": {
+            "one-cmd": "one-import-tf",
+            "commands": {
+                "input_path": "inception_v3.pb",
+                "output_path": "inception_v3.circle",
+                "input_arrays": "input",
+                "input_shapes": "1,299,299,3",
+                "output_arrays": "InceptionV3/Predictions/Reshape_1",
+                "converter_version": "v2"
+            }
+        },
+        "OPTIMIZE": {
+            "one-cmd": "one-optimize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.opt.circle"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_038.test b/compiler/one-cmds/tests/onecc_038.test
new file mode 100644
index 000000000..6b8f7cf64
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_038.test
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run a workflow where one-import-tf -> one-quantize (input data given in list format)
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_038.workflow.json"
+outputfile="inception_v3.list.quantized.circle"
+
+rm -rf ${outputfile}
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/onecc_038.workflow.json b/compiler/one-cmds/tests/onecc_038.workflow.json
new file mode 100644
index 000000000..5ac515d00
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_038.workflow.json
@@ -0,0 +1,31 @@
+{
+    "workflows": [
+        "SIMPLE_WORKFLOW"
+    ],
+    "SIMPLE_WORKFLOW": {
+        "steps": [
+            "IMPORT",
+            "QUANTIZE"
+        ],
+        "IMPORT": {
+            "one-cmd": "one-import-tf",
+            "commands": {
+                "input_path": "inception_v3.pb",
+                "output_path": "inception_v3.circle",
+                "input_arrays": "input",
+                "input_shapes": "1,299,299,3",
+                "output_arrays": "InceptionV3/Predictions/Reshape_1",
+                "converter_version": "v2"
+            }
+        },
+        "QUANTIZE": {
+            "one-cmd": "one-quantize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.list.quantized.circle",
+                "input_data": "datalist.txt",
+                "input_data_format": "list"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_039.test b/compiler/one-cmds/tests/onecc_039.test
new file mode 100644
index 000000000..7db9d901c
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_039.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run a workflow where one-quantize quantizes the model and evaluates the result
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+check_message()
+{
+  if grep -q "MPEIR for InceptionV3/Predictions/Reshape_1 is" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  trap_err_onexit
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_039.workflow.json"
+outputfile="inception_v3.onecc_039.q.circle"
+
+rm -rf ${outputfile}
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+check_message
diff --git a/compiler/one-cmds/tests/onecc_039.workflow.json b/compiler/one-cmds/tests/onecc_039.workflow.json
new file mode 100644
index 000000000..55ef56988
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_039.workflow.json
@@ -0,0 +1,21 @@
+{
+    "workflows": [
+        "SIMPLE_WORKFLOW"
+    ],
+    "SIMPLE_WORKFLOW": {
+        "steps": [
+            "QUANTIZE"
+        ],
+        "QUANTIZE": {
+            "one-cmd": "one-quantize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.onecc_039.q.circle",
+                "input_data": "inception_v3_test_data.h5",
+                "evaluate_result": "True",
+                "test_data": "inception_v3_test_data.h5",
+                "print_mpeir": "True"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_040.cfg b/compiler/one-cmds/tests/onecc_040.cfg
new file mode 100644
index 000000000..4776ea80e
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_040.cfg
@@ -0,0 +1,20 @@
+[onecc]
+one-import-tf=True
+one-import-tflite=False
+one-import-bcq=False
+one-optimize=True
+one-quantize=False
+one-pack=False
+one-codegen=False
+
+[one-import-tf]
+input_path=inception_v3.pb
+output_path=inception_v3.circle
+input_arrays=input
+input_shapes=1,299,299,3
+output_arrays=InceptionV3/Predictions/Reshape_1
+converter_version=v2
+
+[one-optimize]
+input_path=inception_v3.circle
+output_path=inception_v3.opt.circle
diff --git a/compiler/one-cmds/tests/onecc_040.test b/compiler/one-cmds/tests/onecc_040.test
new file mode 100644
index 000000000..2f7567730
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_040.test
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run a workflow with cfg reference
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_040.workflow.json"
+outputfile="inception_v3.opt.circle"
+
+rm -rf ${outputfile}
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/onecc_040.workflow.json b/compiler/one-cmds/tests/onecc_040.workflow.json
new file mode 100644
index 000000000..2d4119b21
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_040.workflow.json
@@ -0,0 +1,10 @@
+{
+    "workflows": [
+        "MY_WORKFLOW"
+    ],
+    "MY_WORKFLOW": {
+        "cfg-reference": {
+            "path": "onecc_040.cfg"
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_041.cfg b/compiler/one-cmds/tests/onecc_041.cfg
new file mode 100644
index 000000000..16135f074
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_041.cfg
@@ -0,0 +1,16 @@
+[onecc]
+one-import-tf=True
+one-import-tflite=False
+one-import-bcq=False
+one-optimize=False
+one-quantize=False
+one-pack=False
+one-codegen=False
+
+[one-import-tf]
+input_path=inception_v3.pb
+output_path=inception_v3_without_opt.circle
+input_arrays=input
+input_shapes=1,299,299,3
+output_arrays=InceptionV3/Predictions/Reshape_1
+converter_version=v2
diff --git a/compiler/one-cmds/tests/onecc_041.test b/compiler/one-cmds/tests/onecc_041.test
new file mode 100644
index 000000000..791dd12ca
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_041.test
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run multiple workflows, with INFER scheduled after WITHOUT_OPT and WITH_OPT
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+check_message()
+{
+  if grep -q "Do inference of inception_v3_without_opt\.circle" "${filename}.log" &&
+  grep -q "Do inference of inception_v3\.opt\.circle" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  trap_err_onexit
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_041.workflow.json"
+outputfile1="inception_v3_without_opt.circle"  # output_path of onecc_041.cfg (WITHOUT_OPT workflow)
+outputfile2="inception_v3.opt.circle"          # output_path of the OPTIMIZE step (WITH_OPT workflow)
+
+cp dummy-inferV2 ../bin/dummy-inferV2
+
+rm -rf ${outputfile1} ${outputfile2}  # clear stale outputs so the -s checks below reflect this run only
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+rm -rf ../bin/dummy-inferV2
+
+if [[ ! -s "${outputfile1}" ]] || [[ ! -s "${outputfile2}" ]]; then  # fail when either output is missing or empty
+  trap_err_onexit
+fi
+
+check_message
diff --git a/compiler/one-cmds/tests/onecc_041.workflow.json b/compiler/one-cmds/tests/onecc_041.workflow.json
new file mode 100644
index 000000000..7dfc1c664
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_041.workflow.json
@@ -0,0 +1,61 @@
+{
+    "workflows": [
+        "WITHOUT_OPT",
+        "WITH_OPT",
+        "INFER"
+    ],
+    "INFER": {
+        "run-after": [
+            "WITHOUT_OPT",
+            "WITH_OPT"
+        ],
+        "steps": [
+            "INFER1",
+            "INFER2"
+        ],
+        "INFER1": {
+            "one-cmd": "one-infer",
+            "commands" : {
+                "driver": "dummy-inferV2",
+                "command": "inception_v3_without_opt.circle"
+            }
+        },
+        "INFER2": {
+            "one-cmd": "one-infer",
+            "commands": {
+                "driver": "dummy-inferV2",
+                "command": "inception_v3.opt.circle"
+            }
+        }
+    },
+    "WITHOUT_OPT": {
+        "cfg-reference": {
+            "path": "onecc_041.cfg"
+        }
+    },
+    "WITH_OPT": {
+        "steps": [
+            "IMPORT_TF",
+            "OPTIMIZE"
+        ],
+        "IMPORT_TF": {
+            "one-cmd": "one-import-tf",
+            "commands": {
+                "input_path": "inception_v3.pb",
+                "output_path": "inception_v3.circle",
+                "input_arrays": "input",
+                "input_shapes": "1,299,299,3",
+                "output_arrays": "InceptionV3/Predictions/Reshape_1",
+                "converter_version": "v2"
+            }
+        },
+        "OPTIMIZE": {
+            "one-cmd": "one-optimize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.opt.circle"
+            }
+        }
+    }
+    
+}
diff --git a/compiler/one-cmds/tests/onecc_neg_009.test b/compiler/one-cmds/tests/onecc_neg_009.test
new file mode 100644
index 000000000..54dd129e4
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_009.test
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Valid optimization option but invalid configuration file path
+
+: '
+This test assumes below directories.
+
+[one hierarchy]
+    one
+    ├── backends
+    ├── bin
+    ├── doc
+    ├── include
+    ├── lib
+    ├── optimization
+    └── test # pwd
+'
+
+OPT_ALREADY_EXIST=true
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  rm -rf ../optimization/OONECC_NEG_009.cfg
+  if [ "$OPT_ALREADY_EXIST" = false ]; then
+    rm -rf ../optimization
+  fi
+  if grep -q "Not found given configuration file" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+if [ ! -d "../optimization" ]; then
+  mkdir -p ../optimization
+  OPT_ALREADY_EXIST=false
+fi
+
+
+touch ../optimization/OONECC_NEG_009.cfg
+
+configfile=".."
+
+# run test
+onecc -C ${configfile} -OONECC_NEG_009 > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_010.test b/compiler/one-cmds/tests/onecc_neg_010.test
new file mode 100644
index 000000000..ddad5e6de
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_010.test
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Invalid optimization option
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Invalid optimization option" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile=".."
+
+# run test
+onecc -C ${configfile} -OONECC_NEG_010 > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_011.cfg b/compiler/one-cmds/tests/onecc_neg_011.cfg
new file mode 100644
index 000000000..b5873245b
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_011.cfg
@@ -0,0 +1,13 @@
+[onecc]
+one-import-tf=False
+one-import-tflite=False
+one-import-bcq=False
+one-optimize=True
+one-quantize=False
+one-pack=False
+one-codegen=False
+
+[one-optimize]
+input_path=inception_v3.circle
+output_path=inception_v3.opt.circle
+wrong_opt=True
diff --git a/compiler/one-cmds/tests/onecc_neg_011.test b/compiler/one-cmds/tests/onecc_neg_011.test
new file mode 100644
index 000000000..3f043a77e
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_011.test
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Generate an error for an unrecognized optimization option
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "following arguments are unrecognized" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="onecc_neg_011.cfg"
+
+# run test
+onecc -C ${configfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_012.cfg b/compiler/one-cmds/tests/onecc_neg_012.cfg
new file mode 100644
index 000000000..fdc73ef43
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_012.cfg
@@ -0,0 +1,15 @@
+[onecc]
+one-import-tf=False
+one-import-tflite=False
+one-import-bcq=False
+one-optimize=False
+one-quantize=False
+one-pack=False
+one-codegen=False
+one-profile=False
+one-infer=True
+
+[one-infer]
+driver=dummy-infer
+backend=dummy
+command="dummy arguments"
diff --git a/compiler/one-cmds/tests/onecc_neg_012.test b/compiler/one-cmds/tests/onecc_neg_012.test
new file mode 100644
index 000000000..9feca5f54
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_012.test
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Check that the driver and backend options are mutually exclusive
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "\-d and -b options are mutually exclusive" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="onecc_neg_012.cfg"
+
+# run test
+onecc -C ${configfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_013.test b/compiler/one-cmds/tests/onecc_neg_013.test
new file mode 100644
index 000000000..0dd8a0fdd
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_013.test
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with missing workflow file
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Not found given workflow file" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_neg_013.workflow.json"
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_014.test b/compiler/one-cmds/tests/onecc_neg_014.test
new file mode 100644
index 000000000..2ed5dcbf5
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_014.test
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# invalid workflow file
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Invalid workflow file" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_neg_014.workflow.json"
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_014.workflow.json b/compiler/one-cmds/tests/onecc_neg_014.workflow.json
new file mode 100644
index 000000000..8d4fd431e
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_014.workflow.json
@@ -0,0 +1,3 @@
+{
+    INVALID JSON FILE
+}
diff --git a/compiler/one-cmds/tests/onecc_neg_015.test b/compiler/one-cmds/tests/onecc_neg_015.test
new file mode 100644
index 000000000..079ba677a
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_015.test
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# workflow file is missing the top-level "workflows" key (it is misspelled)
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Not found" "${filename}.log" &&
+  grep -q "key in workflow file" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_neg_015.workflow.json"
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_015.workflow.json b/compiler/one-cmds/tests/onecc_neg_015.workflow.json
new file mode 100644
index 000000000..4cb752e4e
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_015.workflow.json
@@ -0,0 +1,21 @@
+{
+    "workflowsssssss": [
+        "SIMPLE_WORKFLOW"
+    ],
+    "SIMPLE_WORKFLOW": {
+        "steps": [
+            "QUANTIZE"
+        ],
+        "QUANTIZE": {
+            "one-cmd": "one-quantize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.onecc_026.q.circle",
+                "input_data": "inception_v3_test_data.h5",
+                "evaluate_result": "True",
+                "test_data": "inception_v3_test_data.h5",
+                "print_mpeir": "True"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_neg_016.test b/compiler/one-cmds/tests/onecc_neg_016.test
new file mode 100644
index 000000000..c52763f47
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_016.test
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# workflow listed in "workflows" has no matching key in the workflow file
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Not found" "${filename}.log" &&
+  grep -q "key listed in" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_neg_016.workflow.json"
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_016.workflow.json b/compiler/one-cmds/tests/onecc_neg_016.workflow.json
new file mode 100644
index 000000000..c929cf38c
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_016.workflow.json
@@ -0,0 +1,21 @@
+{
+    "workflows": [
+        "SIMPLE_WORKFLOW"
+    ],
+    "SIMPLE_WORKFLOWWWWW": {
+        "steps": [
+            "QUANTIZE"
+        ],
+        "QUANTIZE": {
+            "one-cmd": "one-quantize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.onecc_026.q.circle",
+                "input_data": "inception_v3_test_data.h5",
+                "evaluate_result": "True",
+                "test_data": "inception_v3_test_data.h5",
+                "print_mpeir": "True"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_neg_017.test b/compiler/one-cmds/tests/onecc_neg_017.test
new file mode 100644
index 000000000..2f173d2f6
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_017.test
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# workflow defines neither "steps" nor "cfg-reference"
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Each workflow should have either" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_neg_017.workflow.json"
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_017.workflow.json b/compiler/one-cmds/tests/onecc_neg_017.workflow.json
new file mode 100644
index 000000000..22f1415e9
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_017.workflow.json
@@ -0,0 +1,18 @@
+{
+    "workflows": [
+        "SIMPLE_WORKFLOW"
+    ],
+    "SIMPLE_WORKFLOW": {
+        "QUANTIZE": {
+            "one-cmd": "one-quantize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.onecc_026.q.circle",
+                "input_data": "inception_v3_test_data.h5",
+                "evaluate_result": "True",
+                "test_data": "inception_v3_test_data.h5",
+                "print_mpeir": "True"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_neg_018.test b/compiler/one-cmds/tests/onecc_neg_018.test
new file mode 100644
index 000000000..bc2297ed0
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_018.test
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# workflow has both "steps" and "cfg-reference" keys, which are exclusive
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "are exclusive key" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_neg_018.workflow.json"
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_018.workflow.json b/compiler/one-cmds/tests/onecc_neg_018.workflow.json
new file mode 100644
index 000000000..58cb88e17
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_018.workflow.json
@@ -0,0 +1,24 @@
+{
+    "workflows": [
+        "MY_WORKFLOW"
+    ],
+    "MY_WORKFLOW": {
+        "steps": [
+            "IMPORT_TF"
+        ],
+        "cfg-reference": {
+            "path": "/path/to/ini/format/file"
+        },
+        "IMPORT_TF": {
+            "one-cmd": "one-import-tf",
+            "commands": {
+                "input_path": "inception_v3.pb",
+                "output_path": "inception_v3.circle",
+                "input_arrays": "input",
+                "input_shapes": "1,299,299,3",
+                "output_arrays": "InceptionV3/Predictions/Reshape_1",
+                "converter_version": "v2"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_neg_019.test b/compiler/one-cmds/tests/onecc_neg_019.test
new file mode 100644
index 000000000..11ef3a9ee
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_019.test
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# step is missing the "one-cmd" key (it is misspelled in the workflow file)
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Each step should have" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_neg_019.workflow.json"
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_019.workflow.json b/compiler/one-cmds/tests/onecc_neg_019.workflow.json
new file mode 100644
index 000000000..aedeeecca
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_019.workflow.json
@@ -0,0 +1,21 @@
+{
+    "workflows": [
+        "MY_WORKFLOW"
+    ],
+    "MY_WORKFLOW": {
+        "steps": [
+            "IMPORT_TF"
+        ],
+        "IMPORT_TF": {
+            "one-cmddddddddd": "one-import-tf",
+            "commands": {
+                "input_path": "inception_v3.pb",
+                "output_path": "inception_v3.circle",
+                "input_arrays": "input",
+                "input_shapes": "1,299,299,3",
+                "output_arrays": "InceptionV3/Predictions/Reshape_1",
+                "converter_version": "v2"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_neg_020.test b/compiler/one-cmds/tests/onecc_neg_020.test
new file mode 100644
index 000000000..7f5073d82
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_020.test
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# step is missing the "commands" key (it is misspelled in the workflow file)
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Each step should have" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_neg_020.workflow.json"
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_020.workflow.json b/compiler/one-cmds/tests/onecc_neg_020.workflow.json
new file mode 100644
index 000000000..d3446d38f
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_020.workflow.json
@@ -0,0 +1,21 @@
+{
+    "workflows": [
+        "MY_WORKFLOW"
+    ],
+    "MY_WORKFLOW": {
+        "steps": [
+            "IMPORT_TF"
+        ],
+        "IMPORT_TF": {
+            "one-cmd": "one-import-tf",
+            "commandssssssssss": {
+                "input_path": "inception_v3.pb",
+                "output_path": "inception_v3.circle",
+                "input_arrays": "input",
+                "input_shapes": "1,299,299,3",
+                "output_arrays": "InceptionV3/Predictions/Reshape_1",
+                "converter_version": "v2"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_neg_021.test b/compiler/one-cmds/tests/onecc_neg_021.test
new file mode 100644
index 000000000..e9d4baaee
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_021.test
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# workflows have a cycle
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Workflows should not have a cycle" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_neg_021.workflow.json"
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_021.workflow.json b/compiler/one-cmds/tests/onecc_neg_021.workflow.json
new file mode 100644
index 000000000..6d21111af
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_021.workflow.json
@@ -0,0 +1,44 @@
+{
+    "workflows": [
+        "CYCLE_WF1",
+        "CYCLE_WF2"
+    ],
+    "CYCLE_WF1": {
+        "run-after": [
+            "CYCLE_WF2"
+        ],
+        "steps": [
+            "IMPORT_TF"
+        ],
+        "IMPORT_TF": {
+            "one-cmd": "one-import-tf",
+            "commands": {
+                "input_path": "inception_v3.pb",
+                "output_path": "inception_v3.circle",
+                "input_arrays": "input",
+                "input_shapes": "1,299,299,3",
+                "output_arrays": "InceptionV3/Predictions/Reshape_1",
+                "converter_version": "v2"
+            }
+        }
+    },
+    "CYCLE_WF2": {
+        "run-after": [
+            "CYCLE_WF1"
+        ],
+        "steps": [
+            "IMPORT_TF"
+        ],
+        "IMPORT_TF": {
+            "one-cmd": "one-import-tf",
+            "commands": {
+                "input_path": "inception_v3.pb",
+                "output_path": "inception_v3.circle",
+                "input_arrays": "input",
+                "input_shapes": "1,299,299,3",
+                "output_arrays": "InceptionV3/Predictions/Reshape_1",
+                "converter_version": "v2"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_neg_022.cfg b/compiler/one-cmds/tests/onecc_neg_022.cfg
new file mode 100644
index 000000000..16135f074
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_022.cfg
@@ -0,0 +1,16 @@
+[onecc]
+one-import-tf=True
+one-import-tflite=False
+one-import-bcq=False
+one-optimize=False
+one-quantize=False
+one-pack=False
+one-codegen=False
+
+[one-import-tf]
+input_path=inception_v3.pb
+output_path=inception_v3_without_opt.circle
+input_arrays=input
+input_shapes=1,299,299,3
+output_arrays=InceptionV3/Predictions/Reshape_1
+converter_version=v2
diff --git a/compiler/one-cmds/tests/onecc_neg_022.test b/compiler/one-cmds/tests/onecc_neg_022.test
new file mode 100644
index 000000000..540071729
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_022.test
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# workflows have a cycle
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Workflows should not have a cycle" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_neg_022.workflow.json"
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_022.workflow.json b/compiler/one-cmds/tests/onecc_neg_022.workflow.json
new file mode 100644
index 000000000..2e056acf1
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_022.workflow.json
@@ -0,0 +1,63 @@
+{
+    "workflows": [
+        "WITHOUT_OPT",
+        "WITH_OPT",
+        "INFER"
+    ],
+    "INFER": {
+        "run-after": [
+            "WITHOUT_OPT",
+            "WITH_OPT"
+        ],
+        "steps": [
+            "INFER1",
+            "INFER2"
+        ],
+        "INFER1": {
+            "one-cmd": "one-infer",
+            "commands" : {
+                "driver": "dummy-inferV2",
+                "command": "inception_v3_without_opt.circle"
+            }
+        },
+        "INFER2": {
+            "one-cmd": "one-infer",
+            "commands": {
+                "driver": "dummy-inferV2",
+                "command": "inception_v3.opt.circle"
+            }
+        }
+    },
+    "WITHOUT_OPT": {
+        "cfg-reference": {
+            "path": "onecc_041.cfg"
+        }
+    },
+    "WITH_OPT": {
+        "run-after": [
+            "WITHOUT_OPT"
+        ],
+        "steps": [
+            "IMPORT_TF",
+            "OPTIMIZE"
+        ],
+        "IMPORT_TF": {
+            "one-cmd": "one-import-tf",
+            "commands": {
+                "input_path": "inception_v3.pb",
+                "output_path": "inception_v3.circle",
+                "input_arrays": "input",
+                "input_shapes": "1,299,299,3",
+                "output_arrays": "InceptionV3/Predictions/Reshape_1",
+                "converter_version": "v2"
+            }
+        },
+        "OPTIMIZE": {
+            "one-cmd": "one-optimize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.opt.circle"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/onecc_neg_023.test b/compiler/one-cmds/tests/onecc_neg_023.test
new file mode 100644
index 000000000..09717e8ad
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_023.test
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# workflows have wrong optimize option
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "Change outputs failed" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+workflowfile="onecc_neg_023.workflow.json"
+
+# run test
+onecc -W ${workflowfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_neg_023.workflow.json b/compiler/one-cmds/tests/onecc_neg_023.workflow.json
new file mode 100644
index 000000000..056e704fd
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_neg_023.workflow.json
@@ -0,0 +1,30 @@
+{
+    "workflows": [
+        "WITH_OPT"
+    ],
+    "WITH_OPT": {
+        "steps": [
+            "IMPORT_TF",
+            "OPTIMIZE"
+        ],
+        "IMPORT_TF": {
+            "one-cmd": "one-import-tf",
+            "commands": {
+                "input_path": "inception_v3.pb",
+                "output_path": "inception_v3.circle",
+                "input_arrays": "input",
+                "input_shapes": "1,299,299,3",
+                "output_arrays": "InceptionV3/Predictions/Reshape_1",
+                "converter_version": "v2"
+            }
+        },
+        "OPTIMIZE": {
+            "one-cmd": "one-optimize",
+            "commands": {
+                "input_path": "inception_v3.circle",
+                "output_path": "inception_v3.opt.circle",
+                "change_outputs": "non_existing_node_name"
+            }
+        }
+    }
+}
diff --git a/compiler/one-cmds/tests/prepare_test_materials.sh b/compiler/one-cmds/tests/prepare_test_materials.sh
index c80c59834..c171cfe01 100644
--- a/compiler/one-cmds/tests/prepare_test_materials.sh
+++ b/compiler/one-cmds/tests/prepare_test_materials.sh
@@ -91,6 +91,20 @@ if [[ ! -s "onnx_conv2d_conv2d.onnx" ]]; then
     # https://github.com/Samsung/ONE/issues/5577#issuecomment-755078444
 fi
 
+if [[ ! -s "reshape_matmul.onnx" ]]; then
+    rm -rf reshape_matmul.zip
+    wget https://github.com/Samsung/ONE/files/9082878/reshape_matmul.zip
+    unzip reshape_matmul.zip
+    # https://github.com/Samsung/ONE/issues/9405#issuecomment-1180198137
+fi
+
+if [[ ! -s "Net_InstanceNorm_003.part" ]]; then
+    rm -rf Net_InstanceNorm_003.zip
+    wget https://github.com/Samsung/ONE/files/8608844/Net_InstanceNorm_003.zip
+    unzip Net_InstanceNorm_003.zip
+    # https://github.com/Samsung/ONE/issues/8570#issuecomment-1115804257
+fi
+
 function files_missing() {
     condition="test "
 
diff --git a/compiler/one-cmds/utils.py b/compiler/one-cmds/utils.py
index be0322aca..d204447fd 100644
--- a/compiler/one-cmds/utils.py
+++ b/compiler/one-cmds/utils.py
@@ -47,6 +47,25 @@ def _add_default_arg(parser):
     parser.add_argument('-S', '--section', type=str, help=argparse.SUPPRESS)
 
 
+def _add_default_arg_no_CS(parser):
+    """
+    Add only -v/--version and -V/--verbose to parser (no -C nor -S).
+    """
+    # version: store_true flag only; printing the version and exiting is left to the caller
+    parser.add_argument(
+        '-v',
+        '--version',
+        action='store_true',
+        help='show program\'s version number and exit')
+
+    # verbose: store_true flag, False unless -V/--verbose is given
+    parser.add_argument(
+        '-V',
+        '--verbose',
+        action='store_true',
+        help='output additional information to stdout or stderr')
+
+
 def is_accumulated_arg(arg, driver):
     if driver == "one-quantize":
         accumulables = [
@@ -62,6 +81,43 @@ def _is_valid_attr(args, attr):
     return hasattr(args, attr) and getattr(args, attr)
 
 
+class Command:  # builds a driver command line step by step; add_* methods return self
+    def __init__(self, driver, args, log_file):
+        self.cmd = [driver]  # argv list; options are appended by the add_* methods
+        self.driver = driver  # also passed to _run() as err_prefix
+        self.args = args  # object whose attributes provide the option values
+        self.log_file = log_file  # receives bytes in run()
+
+    # Append `option` followed by the value of every attr in `attrs`, but only
+    # when all of those attrs are set (and truthy) on self.args; else do nothing
+    def add_option_with_valid_args(self, option, attrs):
+        for attr in attrs:
+            if not _is_valid_attr(self.args, attr):
+                return self
+        self.cmd.append(option)
+        for attr in attrs:
+            self.cmd.append(getattr(self.args, attr))
+        return self
+
+    # Append `option` and then each item of `values` unconditionally
+    def add_option_with_values(self, option, values):
+        self.cmd.append(option)
+        for value in values:
+            self.cmd.append(value)
+        return self
+
+    # Append a flag that takes no argument (ex: --verbose) if attr is set on self.args
+    def add_noarg_option_if_valid_arg(self, option, attr):
+        if _is_valid_attr(self.args, attr):
+            self.cmd.append(option)
+        return self
+
+    # Write the space-joined command line to log_file (as bytes), then execute it via _run
+    def run(self):
+        self.log_file.write((' '.join(self.cmd) + '\n').encode())
+        _run(self.cmd, err_prefix=self.driver, logfile=self.log_file)
+
+
 def _parse_cfg_and_overwrite(config_path, section, args):
     """
     parse given section of configuration file and set the values of args.
@@ -153,8 +209,7 @@ def _run(cmd, err_prefix=None, logfile=None):
         err_prefix: prefix to be put before every stderr lines
         logfile: file stream to which both of stdout and stderr lines will be written
     """
-    with subprocess.Popen(
-            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1) as p:
+    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
         import select
         inputs = set([p.stdout, p.stderr])
         while inputs:
diff --git a/compiler/onnx-tools/CMakeLists.txt b/compiler/onnx-tools/CMakeLists.txt
index ac4500e0e..5935cdfbe 100644
--- a/compiler/onnx-tools/CMakeLists.txt
+++ b/compiler/onnx-tools/CMakeLists.txt
@@ -18,4 +18,10 @@ foreach(ONNX_TOOL IN ITEMS ${ONNX_TOOL_FILES})
 
   add_custom_target(${ONNX_TOOL_TARGET} ALL DEPENDS ${ONNX_TOOL_BIN})
 
+  install(FILES ${ONNX_TOOL_BIN}
+          PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
+                      GROUP_READ GROUP_EXECUTE
+                      WORLD_READ WORLD_EXECUTE
+          DESTINATION bin)
+
 endforeach(ONNX_TOOL)
diff --git a/compiler/pota-quantization-value-test/CMakeLists.txt b/compiler/pota-quantization-value-test/CMakeLists.txt
index 51fd9a391..96dfc8687 100644
--- a/compiler/pota-quantization-value-test/CMakeLists.txt
+++ b/compiler/pota-quantization-value-test/CMakeLists.txt
@@ -1,3 +1,7 @@
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
 unset(QUANTIZATION_VALUE_TEST)
 unset(QUANTIZATION_VALUE_TEST_WITH_PARAM)
 unset(QUANTIZATION_CONFIG_VALUE_TEST)
diff --git a/compiler/record-minmax-conversion-test/CMakeLists.txt b/compiler/record-minmax-conversion-test/CMakeLists.txt
index 31b906142..636361405 100644
--- a/compiler/record-minmax-conversion-test/CMakeLists.txt
+++ b/compiler/record-minmax-conversion-test/CMakeLists.txt
@@ -1,3 +1,7 @@
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
 unset(RECORD_MINMAX_CONVERSION_TEST)
 
 macro(addTest NAME)
diff --git a/compiler/record-minmax/driver/Driver.cpp b/compiler/record-minmax/driver/Driver.cpp
index c9f1d0ca7..faa402f01 100644
--- a/compiler/record-minmax/driver/Driver.cpp
+++ b/compiler/record-minmax/driver/Driver.cpp
@@ -34,62 +34,33 @@ int entry(const int argc, char **argv)
   arser::Arser arser(
     "Embedding min/max values of activations to the circle model for post-training quantization");
 
-  arser.add_argument("--version")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("Show version information and exit")
-    .exit_with(print_version);
-
-  arser.add_argument("-V", "--verbose")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("output additional information to stdout or stderr");
+  arser::Helper::add_version(arser, print_version);
+  arser::Helper::add_verbose(arser);
 
-  arser.add_argument("--input_model")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(true)
-    .help("Input model filepath");
+  arser.add_argument("--input_model").required(true).help("Input model filepath");
 
   arser.add_argument("--input_data")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(false)
     .help("Input data filepath. If not given, record-minmax will run with randomly generated data. "
           "Note that the random dataset does not represent inference workload, leading to poor "
           "model accuracy.");
 
-  arser.add_argument("--output_model")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .required(true)
-    .help("Output model filepath");
+  arser.add_argument("--output_model").required(true).help("Output model filepath");
 
   arser.add_argument("--min_percentile")
-    .nargs(1)
     .type(arser::DataType::FLOAT)
     .help("Record n'th percentile of min");
 
   arser.add_argument("--max_percentile")
-    .nargs(1)
     .type(arser::DataType::FLOAT)
     .help("Record n'th percentile of max");
 
-  arser.add_argument("--mode")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .help("Record mode. percentile (default) or moving_average");
+  arser.add_argument("--mode").help("Record mode. percentile (default) or moving_average");
 
   arser.add_argument("--input_data_format")
-    .nargs(1)
-    .type(arser::DataType::STR)
     .help("Input data format. h5/hdf5 (default) or list/filelist");
 
   arser.add_argument("--generate_profile_data")
     .nargs(0)
-    .required(false)
     .default_value(false)
     .help("This will turn on profiling data generation.");
 
diff --git a/compiler/record-minmax/include/RecordFunction.h b/compiler/record-minmax/include/RecordFunction.h
index ba199d071..5b993e4b3 100644
--- a/compiler/record-minmax/include/RecordFunction.h
+++ b/compiler/record-minmax/include/RecordFunction.h
@@ -18,7 +18,7 @@
 #include <cassert>
 #include <algorithm>
 #include <cmath>
-#include <numeric>
+#include <limits>
 #include <stdexcept>
 
 namespace record_minmax
diff --git a/compiler/record-minmax/src/MinMaxObserver.cpp b/compiler/record-minmax/src/MinMaxObserver.cpp
index 8288d3e5e..e6edbdca9 100644
--- a/compiler/record-minmax/src/MinMaxObserver.cpp
+++ b/compiler/record-minmax/src/MinMaxObserver.cpp
@@ -18,6 +18,7 @@
 
 #include <luci/IR/CircleOpcode.h>
 
+#include <limits>
 #include <math.h>
 
 using DataType = luci_interpreter::DataType;
@@ -75,7 +76,7 @@ void MinMaxObserver::postTensorWrite(const luci::CircleNode *node,
         // Reshape changes only shape of input tensor, efficiently is it a no-op.
         return;
       default:
-        throw std::runtime_error("Tensor's data type is not float");
+        throw std::runtime_error("Tensor's data type is not float. " + node->name());
     }
   }
 
diff --git a/compiler/record-minmax/src/RecordMinMax.cpp b/compiler/record-minmax/src/RecordMinMax.cpp
index 10a14516f..6dbf98dc6 100644
--- a/compiler/record-minmax/src/RecordMinMax.cpp
+++ b/compiler/record-minmax/src/RecordMinMax.cpp
@@ -186,7 +186,13 @@ void RecordMinMax::initialize(const std::string &input_model_path)
     throw std::runtime_error("Failed to verify circle '" + input_model_path + "'");
   }
 
-  _module = luci::Importer().importModule(circle::GetModel(model_data.data()));
+  const circle::Model *circle_model = circle::GetModel(model_data.data());
+  if (circle_model == nullptr)
+  {
+    throw std::runtime_error("Failed to load '" + input_model_path + "'");
+  }
+
+  _module = luci::Importer().importModule(circle_model);
 
   if (_module == nullptr)
   {
diff --git a/compiler/souschef/CMakeLists.txt b/compiler/souschef/CMakeLists.txt
index f57102f1f..8dcf4c2b8 100644
--- a/compiler/souschef/CMakeLists.txt
+++ b/compiler/souschef/CMakeLists.txt
@@ -1,13 +1,20 @@
 nnas_find_package(Protobuf QUIET)
+nnas_find_package(Fp16Source QUIET)
 
 if(NOT Protobuf_FOUND)
   message(STATUS "Build souschef: FAILED (missing Protobuf)")
   return()
 endif(NOT Protobuf_FOUND)
 
+if(NOT Fp16Source_FOUND)
+  message(STATUS "Build souschef: FAILED (missing Fp16Source)")
+  return()
+endif(NOT Fp16Source_FOUND)
+
 file(GLOB_RECURSE SOURCES "src/*.cpp")
 
 add_library(souschef STATIC ${SOURCES})
 set_target_properties(souschef PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_include_directories(souschef PRIVATE ${Fp16Source_DIR}/include)
 target_include_directories(souschef PUBLIC include)
 target_link_libraries(souschef PUBLIC libprotobuf)
diff --git a/compiler/souschef/include/souschef/Data/Explicit.h b/compiler/souschef/include/souschef/Data/Explicit.h
index 7cbb773da..434d0ec2c 100644
--- a/compiler/souschef/include/souschef/Data/Explicit.h
+++ b/compiler/souschef/include/souschef/Data/Explicit.h
@@ -96,6 +96,41 @@ template <typename T> struct ExplicitDataChefFactory : public DataChefFactory
   }
 };
 
+class ExplicitFloat16DataChef final : public DataChef
+{
+public:
+  ExplicitFloat16DataChef()
+  {
+    // DO NOTHING (values are supplied afterwards through insert())
+  }
+
+public:
+  std::vector<uint8_t> generate(int32_t count) const override; // defined in Explicit.cpp
+
+public:
+  void insert(const float &value) { _values.emplace_back(value); } // appends; order is kept
+
+private:
+  // NOTE values are kept as float and converted to 16-bit (uint16_t) in generate()
+  std::vector<float> _values;
+};
+
+struct ExplicitFloat16DataChefFactory : public DataChefFactory
+{
+  std::unique_ptr<DataChef> create(const Arguments &args) const
+  {
+    std::unique_ptr<ExplicitFloat16DataChef> res{new ExplicitFloat16DataChef};
+    // parse every argument as float and hand it to the chef in argument order
+    for (uint32_t n = 0; n < args.count(); ++n)
+    {
+      auto const value = to_number<float>(args.value(n));
+      res->insert(value);
+    }
+
+    return std::move(res); // ownership moves into the returned unique_ptr<DataChef>
+  }
+};
+
 } // namespace souschef
 
 #endif // __SOUSCHEF_DATA_EXPLICIT_H__
diff --git a/compiler/souschef/include/souschef/Data/Gaussian.h b/compiler/souschef/include/souschef/Data/Gaussian.h
index 8093b4c41..c9ac571f9 100644
--- a/compiler/souschef/include/souschef/Data/Gaussian.h
+++ b/compiler/souschef/include/souschef/Data/Gaussian.h
@@ -41,6 +41,22 @@ private:
   float _stddev;
 };
 
+class GaussianFloat16DataChef final : public DataChef
+{
+public:
+  GaussianFloat16DataChef(float mean, float stddev) : _mean{mean}, _stddev{stddev}
+  {
+    // DO NOTHING (parameters are only stored; sampling happens in generate())
+  }
+
+public:
+  std::vector<uint8_t> generate(int32_t count) const override; // defined in Gaussian.cpp
+
+private:
+  float _mean;   // mean handed to std::normal_distribution in generate()
+  float _stddev; // standard deviation handed to std::normal_distribution in generate()
+};
+
 class GaussianInt32DataChef final : public DataChef
 {
 public:
@@ -109,6 +125,11 @@ struct GaussianUint8DataChefFactory : public DataChefFactory
   std::unique_ptr<DataChef> create(const Arguments &args) const;
 };
 
+struct GaussianFloat16DataChefFactory : public DataChefFactory
+{
+  std::unique_ptr<DataChef> create(const Arguments &args) const; // args: mean, stddev
+};
+
 } // namespace souschef
 
 #endif // __SOUSCHEF_DATA_GAUSSIAN_H__
diff --git a/compiler/souschef/src/Explicit.cpp b/compiler/souschef/src/Explicit.cpp
index eb36cb7c3..3278ae3c3 100644
--- a/compiler/souschef/src/Explicit.cpp
+++ b/compiler/souschef/src/Explicit.cpp
@@ -19,6 +19,8 @@
 #include <string>
 #include <vector>
 
+#include <fp16.h>
+
 namespace souschef
 {
 
@@ -74,4 +76,23 @@ void ExplicitDataChef<std::string>::write_value(std::vector<uint8_t> &res, int32
   }
 }
 
+std::vector<uint8_t> ExplicitFloat16DataChef::generate(int32_t count) const
+{
+  // Emits `count` IEEE fp16 values (2 bytes each, host byte order); elements
+  // past the explicit values are zero. A non-positive count yields no data.
+  std::vector<uint8_t> res;
+  res.reserve(count > 0 ? static_cast<size_t>(count) * sizeof(uint16_t) : 0);
+  for (int32_t n = 0; n < count; ++n) // signed index: a negative count must not wrap
+  {
+    auto const idx = static_cast<size_t>(n);
+    float const fvalue = (idx < _values.size()) ? _values[idx] : 0.0f;
+    uint16_t const value = fp16_ieee_from_fp32_value(fvalue);
+    auto const arr = reinterpret_cast<const uint8_t *>(&value);
+    // append the raw bytes of the half-precision value
+    res.insert(res.end(), arr, arr + sizeof(value));
+  }
+
+  return res;
+}
+
 } // namespace souschef
diff --git a/compiler/souschef/src/Gaussian.cpp b/compiler/souschef/src/Gaussian.cpp
index 32cbcff4d..53a62cabf 100644
--- a/compiler/souschef/src/Gaussian.cpp
+++ b/compiler/souschef/src/Gaussian.cpp
@@ -23,6 +23,8 @@
 #include <cassert>
 #include <stdexcept>
 
+#include <fp16.h>
+
 namespace souschef
 {
 
@@ -36,7 +38,7 @@ static std::vector<uint8_t> generate_gaussian(int32_t count, float mean, float s
   std::vector<uint8_t> res;
 
   constexpr float max_cap = std::numeric_limits<T>::max();
-  constexpr float min_cap = std::numeric_limits<T>::min();
+  constexpr float min_cap = std::numeric_limits<T>::lowest();
   for (uint32_t n = 0; n < count; ++n)
   {
     float raw_value = dist(rand);
@@ -69,6 +71,34 @@ std::vector<uint8_t> GaussianFloat32DataChef::generate(int32_t count) const
   return generate_gaussian<float>(count, _mean, _stddev);
 }
 
+std::vector<uint8_t> GaussianFloat16DataChef::generate(int32_t count) const
+{
+  // Returns `count` samples of N(_mean, _stddev) as IEEE fp16 (2 bytes each, host
+  // byte order). The generator is seeded from the system clock, so output varies.
+  auto time_stamp = std::chrono::system_clock::now().time_since_epoch().count();
+  std::minstd_rand rand{static_cast<std::minstd_rand::result_type>(time_stamp)};
+  std::normal_distribution<float> dist{_mean, _stddev};
+
+  std::vector<uint8_t> res;
+  res.reserve(count > 0 ? static_cast<size_t>(count) * sizeof(uint16_t) : 0);
+
+  // Clamp to the largest finite half-precision magnitude (65504); samples far
+  // beyond it would otherwise overflow to +/-infinity when converted to fp16.
+  constexpr float max_cap = 65504.0f;
+  constexpr float min_cap = -65504.0f;
+  for (int32_t n = 0; n < count; ++n) // signed index: a negative count must not wrap
+  {
+    const float raw_value = dist(rand);
+    const float capped_value = std::max(min_cap, std::min(max_cap, raw_value));
+    const uint16_t value = fp16_ieee_from_fp32_value(capped_value);
+    auto const arr = reinterpret_cast<const uint8_t *>(&value);
+
+    res.insert(res.end(), arr, arr + sizeof(value));
+  }
+
+  return res;
+}
+
 std::vector<uint8_t> GaussianInt32DataChef::generate(int32_t count) const
 {
   return generate_gaussian<int32_t>(count, _mean, _stddev);
@@ -136,4 +166,17 @@ std::unique_ptr<DataChef> GaussianUint8DataChefFactory::create(const Arguments &
   return std::unique_ptr<DataChef>{new GaussianUint8DataChef{mean, stddev}};
 }
 
+std::unique_ptr<DataChef> GaussianFloat16DataChefFactory::create(const Arguments &args) const
+{
+  if (args.count() != 2)
+  {
+    throw std::runtime_error{"invalid argument count: two arguments (mean/stddev) are expected"};
+  }
+  // value(0) is the mean, value(1) the standard deviation; both parsed as float
+  auto const mean = to_number<float>(args.value(0));
+  auto const stddev = to_number<float>(args.value(1));
+
+  return std::unique_ptr<DataChef>{new GaussianFloat16DataChef{mean, stddev}};
+}
+
 } // namespace souschef
diff --git a/compiler/tf2circle-conversion-test/CMakeLists.txt b/compiler/tf2circle-conversion-test/CMakeLists.txt
index 27f2463f3..79a39873b 100644
--- a/compiler/tf2circle-conversion-test/CMakeLists.txt
+++ b/compiler/tf2circle-conversion-test/CMakeLists.txt
@@ -128,6 +128,10 @@ list(APPEND TEST_DEPS "${TEST_CONFIG}")
 # This "tf2circle_conversion_test_deps" target enforces CMake to generate all the dependencies during "build" phase
 add_custom_target(tf2circle_conversion_test_deps ALL DEPENDS ${TEST_DEPS})
 
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
 # Run tests
 add_test(
   NAME tf2circle_conversion_test
diff --git a/compiler/tf2circle-dredd-pb-test/CMakeLists.txt b/compiler/tf2circle-dredd-pb-test/CMakeLists.txt
index 48b098e24..83596fade 100644
--- a/compiler/tf2circle-dredd-pb-test/CMakeLists.txt
+++ b/compiler/tf2circle-dredd-pb-test/CMakeLists.txt
@@ -132,6 +132,10 @@ list(APPEND DEPS "${TARGET_RULE_LIB}")
 # Generate dependencies
 add_custom_target(tf2circle_dredd_pb_deps ALL DEPENDS ${DEPS})
 
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
 add_test(
   NAME tf2circle_dredd_pb_test
   COMMAND
diff --git a/compiler/tf2circle-dredd-pbtxt-test/CMakeLists.txt b/compiler/tf2circle-dredd-pbtxt-test/CMakeLists.txt
index 789e58535..427e57502 100644
--- a/compiler/tf2circle-dredd-pbtxt-test/CMakeLists.txt
+++ b/compiler/tf2circle-dredd-pbtxt-test/CMakeLists.txt
@@ -175,6 +175,10 @@ list(APPEND DEPS "${TARGET_RULE_LIB}")
 # Generate dependencies
 add_custom_target(tf2circle_dredd_pbtxt_deps ALL DEPENDS ${DEPS})
 
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
 add_test(
   NAME tf2circle_dredd_pbtxt_test
   COMMAND
diff --git a/compiler/tf2circle-model-test/CMakeLists.txt b/compiler/tf2circle-model-test/CMakeLists.txt
index 2fb82236a..ad776a62b 100644
--- a/compiler/tf2circle-model-test/CMakeLists.txt
+++ b/compiler/tf2circle-model-test/CMakeLists.txt
@@ -100,6 +100,10 @@ list(APPEND DEPS "${TEST_RUNNER_SCRIPT}")
 ### Generate dependencies
 add_custom_target(tf2circle_model_test_deps ALL DEPENDS ${DEPS})
 
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
 # NOTE This target is not built by default
 add_test(
   NAME tf2circle_model_test
diff --git a/compiler/tf2tflite-dredd-pb-test/CMakeLists.txt b/compiler/tf2tflite-dredd-pb-test/CMakeLists.txt
index b75c50772..ac9f14d70 100644
--- a/compiler/tf2tflite-dredd-pb-test/CMakeLists.txt
+++ b/compiler/tf2tflite-dredd-pb-test/CMakeLists.txt
@@ -1,3 +1,7 @@
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
 nnas_include(TargetRequire)
 
 unset(REQUIRED_TARGETS)
diff --git a/compiler/tf2tflite-dredd-pbtxt-test/CMakeLists.txt b/compiler/tf2tflite-dredd-pbtxt-test/CMakeLists.txt
index 87cf7836f..95a296ef8 100644
--- a/compiler/tf2tflite-dredd-pbtxt-test/CMakeLists.txt
+++ b/compiler/tf2tflite-dredd-pbtxt-test/CMakeLists.txt
@@ -1,3 +1,7 @@
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
 nnas_include(TargetRequire)
 
 unset(REQUIRED_TARGETS)
diff --git a/compiler/tf2tflite-value-pb-test/CMakeLists.txt b/compiler/tf2tflite-value-pb-test/CMakeLists.txt
index 41974f72c..a6c451e0b 100644
--- a/compiler/tf2tflite-value-pb-test/CMakeLists.txt
+++ b/compiler/tf2tflite-value-pb-test/CMakeLists.txt
@@ -1,3 +1,7 @@
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
 nnas_include(TargetRequire)
 
 unset(REQUIRED_TARGETS)
diff --git a/compiler/tf2tflite-value-pbtxt-test/CMakeLists.txt b/compiler/tf2tflite-value-pbtxt-test/CMakeLists.txt
index 2e76e21d3..fde3e60b4 100644
--- a/compiler/tf2tflite-value-pbtxt-test/CMakeLists.txt
+++ b/compiler/tf2tflite-value-pbtxt-test/CMakeLists.txt
@@ -1,3 +1,7 @@
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
 nnas_include(TargetRequire)
 
 unset(REQUIRED_TARGETS)
diff --git a/compiler/tf2tfliteV2-conversion-test/CMakeLists.txt b/compiler/tf2tfliteV2-conversion-test/CMakeLists.txt
index 0b4739374..97aa07fd3 100644
--- a/compiler/tf2tfliteV2-conversion-test/CMakeLists.txt
+++ b/compiler/tf2tfliteV2-conversion-test/CMakeLists.txt
@@ -1,3 +1,7 @@
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
 nncc_find_resource(TensorFlowTests)
 
 #
diff --git a/compiler/tf2tfliteV2/tf2tfliteV2.py b/compiler/tf2tfliteV2/tf2tfliteV2.py
index 6b578ad53..2bcf55328 100755
--- a/compiler/tf2tfliteV2/tf2tfliteV2.py
+++ b/compiler/tf2tfliteV2/tf2tfliteV2.py
@@ -110,6 +110,12 @@ def _get_parser():
         type=str,
         help="Names of the output arrays, comma-separated.")
 
+    # experimental options
+    parser.add_argument(
+        "--experimental_disable_batchmatmul_unfold",
+        action="store_true",
+        help="Experimental disable BatchMatMul unfold")
+
     # Set default value
     parser.set_defaults(model_format="graph_def")
     return parser
@@ -228,6 +234,9 @@ def _v2_convert(flags):
         keras_model = tf.keras.models.load_model(flags.input_path)
         converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
 
+    if flags.experimental_disable_batchmatmul_unfold:
+        converter._experimental_disable_batchmatmul_unfold = True
+
     converter.allow_custom_ops = True
     converter.experimental_new_converter = True
 
diff --git a/compiler/tfl-inspect/CMakeLists.txt b/compiler/tfl-inspect/CMakeLists.txt
index 9e1cb720f..2c6e3a147 100644
--- a/compiler/tfl-inspect/CMakeLists.txt
+++ b/compiler/tfl-inspect/CMakeLists.txt
@@ -1,6 +1,6 @@
-if(NOT TARGET mio_tflite)
+if(NOT TARGET mio_tflite280)
   return()
-endif(NOT TARGET mio_tflite)
+endif(NOT TARGET mio_tflite280)
 
 set(DRIVER "driver/Driver.cpp")
 
diff --git a/compiler/tfl-inspect/driver/Driver.cpp b/compiler/tfl-inspect/driver/Driver.cpp
index 3e62e0ffb..8505ff4aa 100644
--- a/compiler/tfl-inspect/driver/Driver.cpp
+++ b/compiler/tfl-inspect/driver/Driver.cpp
@@ -35,7 +35,7 @@ int entry(int argc, char **argv)
     .nargs(0)
     .help("Dump Conv2D series weight operators in tflite file");
   arser.add_argument("--op_version").nargs(0).help("Dump versions of the operators in tflite file");
-  arser.add_argument("tflite").type(arser::DataType::STR).help("TFLite file to inspect");
+  arser.add_argument("tflite").help("TFLite file to inspect");
 
   try
   {
diff --git a/compiler/tfl-verify/CMakeLists.txt b/compiler/tfl-verify/CMakeLists.txt
index 2fba335ea..5bead5bb4 100644
--- a/compiler/tfl-verify/CMakeLists.txt
+++ b/compiler/tfl-verify/CMakeLists.txt
@@ -1,6 +1,6 @@
-if(NOT TARGET mio_tflite)
+if(NOT TARGET mio_tflite280)
   return()
-endif(NOT TARGET mio_tflite)
+endif(NOT TARGET mio_tflite280)
 
 file(GLOB_RECURSE SOURCES "src/*.cpp")
 
diff --git a/compiler/tfl-verify/src/Driver.cpp b/compiler/tfl-verify/src/Driver.cpp
index 6d1897607..62345494b 100644
--- a/compiler/tfl-verify/src/Driver.cpp
+++ b/compiler/tfl-verify/src/Driver.cpp
@@ -25,7 +25,7 @@
 int entry(int argc, char **argv)
 {
   arser::Arser arser;
-  arser.add_argument("tflite").type(arser::DataType::STR).help("TFLite file path to verify");
+  arser.add_argument("tflite").help("TFLite file path to verify");
 
   try
   {
diff --git a/compiler/tflchef/CMakeLists.txt b/compiler/tflchef/CMakeLists.txt
index 948b1cecd..6205ac650 100644
--- a/compiler/tflchef/CMakeLists.txt
+++ b/compiler/tflchef/CMakeLists.txt
@@ -20,4 +20,9 @@ add_subdirectory(core)
 add_subdirectory(tflite)
 # Tools
 add_subdirectory(tools)
+
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
 add_subdirectory(tests)
diff --git a/compiler/tflchef/core/src/Convert.cpp b/compiler/tflchef/core/src/Convert.cpp
index 200c71eca..f4dd4b332 100644
--- a/compiler/tflchef/core/src/Convert.cpp
+++ b/compiler/tflchef/core/src/Convert.cpp
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -62,6 +63,8 @@ tflite::TensorType as_tflite_tensortype(const tflchef::TensorType &value)
   {
     case tflchef::FLOAT32:
       return tflite::TensorType_FLOAT32;
+    case tflchef::FLOAT16:
+      return tflite::TensorType_FLOAT16;
     case tflchef::INT32:
       return tflite::TensorType_INT32;
     case tflchef::UINT8:
@@ -164,3 +167,222 @@ as_tflite_sparse_index_vec(flatbuffers::FlatBufferBuilder &fb,
 
   throw std::runtime_error("Unknown SparseIndexVector type");
 }
+
+// namespace sparsity code referenced from
+// https://github.com/tensorflow/tensorflow/blob/3f878cff5b698b82eea85db2b60d65a2e320850e/
+//       tensorflow/lite/kernels/internal/utils/sparsity_format_converter.cc
+
+namespace sparsity
+{
+
+template <typename T>
+FormatConverter<T>::FormatConverter(const std::vector<int> &shape,
+                                    const std::vector<int> &traversal_order,
+                                    const std::vector<TfLiteDimensionType> &format,
+                                    const std::vector<int> &block_size,
+                                    const std::vector<int> &block_map)
+  : dense_shape_(shape), traversal_order_(traversal_order), block_size_(block_size),
+    block_map_(block_map)
+{ // copies the layout description and derives dense_size_, blocked_shape_ and format_
+  dense_size_ = 1;   // becomes the product of all entries of shape
+  int block_dim = 0; // next entry of block_map/block_size to consume
+  blocked_shape_.resize(shape.size());
+  format_.resize(shape.size() + block_map.size()); // original dims, then one slot per block dim
+  for (int i = 0; i < shape.size(); i++)
+  {
+    format_[i] = format[traversal_order[i]]; // format looked up through the traversal order
+    dense_size_ *= shape[i];
+    if (block_dim < block_map.size() && block_map[block_dim] == i)
+    { // dim i is blocked; presumably block_map is sorted ascending - TODO confirm
+      blocked_shape_[i] = shape[i] / block_size[block_dim]; // number of blocks (integer division)
+      block_dim++;
+    }
+    else
+    {
+      blocked_shape_[i] = shape[i]; // unblocked dim keeps its size
+    }
+  }
+
+  // Only dense blocks are supported.
+  for (int i = 0; i < block_map.size(); i++)
+  {
+    format_[i + shape.size()] = kTfLiteDimDense;
+  }
+}
+
+template <typename T> bool FormatConverter<T>::DenseToSparse(const T *src_data)
+{ // fills data_ and dim_metadata_ from the dense buffer src_data; always returns true
+  int num_original_dims = dense_shape_.size();
+  int num_block_dims = block_map_.size();
+  int num_expanded_dims = num_original_dims + num_block_dims;
+  std::vector<int> expanded_shape(num_expanded_dims); // blocked dims first, then block sizes
+  for (int i = 0; i < num_expanded_dims; i++)
+  {
+    if (i < num_original_dims)
+    {
+      expanded_shape[i] = blocked_shape_[i];
+    }
+    else
+    {
+      expanded_shape[i] = block_size_[i - num_original_dims];
+    }
+  }
+
+  std::vector<int> shape_offset(num_original_dims); // element stride of each original dim
+  shape_offset[shape_offset.size() - 1] = 1; // NOTE(review): out of range if dense_shape_ is empty
+  for (int i = num_original_dims - 1; i > 0; --i)
+  {
+    shape_offset[i - 1] = shape_offset[i] * dense_shape_[i];
+  }
+
+  std::vector<int> expanded_shape_offset(num_expanded_dims); // strides for the expanded dims
+  for (int i = 0; i < num_original_dims; ++i)
+  {
+    expanded_shape_offset[i] = shape_offset[i];
+  }
+  for (int i = 0; i < num_block_dims; ++i)
+  {
+    int mapped_dim = block_map_[i];
+    expanded_shape_offset[num_original_dims + i] = shape_offset[mapped_dim]; // step inside a block
+    expanded_shape_offset[mapped_dim] *= block_size_[i]; // step from one block to the next
+  }
+
+  std::vector<int> dst_ordered_offset(num_expanded_dims); // strides reordered by traversal_order_
+  for (int i = 0; i < num_expanded_dims; ++i)
+  {
+    dst_ordered_offset[i] = expanded_shape_offset[traversal_order_[i]];
+  }
+
+  std::vector<bool> dst_dim_has_nonzeroes(num_expanded_dims);
+  std::fill(dst_dim_has_nonzeroes.begin(), dst_dim_has_nonzeroes.end(), false);
+  std::vector<int> inner_compressed_dim(num_expanded_dims); // next CSR dim after i, or -1
+  int most_recent_compressed_dim = -1;
+  std::vector<int> num_segments_of_next_compressed_dim(num_expanded_dims); // -1 for dense dims
+  int segment_count = 1;
+  for (int i = num_expanded_dims - 1; i >= 0; --i) // walk from the innermost dim outwards
+  {
+    inner_compressed_dim[i] = most_recent_compressed_dim;
+    if (format_[i] == kTfLiteDimSparseCSR)
+    {
+      most_recent_compressed_dim = i;
+      num_segments_of_next_compressed_dim[i] = segment_count;
+      segment_count = 1;
+    }
+    else
+    {
+      num_segments_of_next_compressed_dim[i] = -1;
+      segment_count *= expanded_shape[traversal_order_[i]];
+    }
+  }
+
+  dim_metadata_.resize(num_expanded_dims * 2); // two arrays per dim: [2*i] and [2*i + 1]
+  std::vector<int> dst_sparse_dims;
+  dst_sparse_dims.reserve(num_expanded_dims);
+  for (int i = 0; i < num_expanded_dims; ++i)
+  {
+    dim_metadata_[i * 2].clear();
+    dim_metadata_[i * 2 + 1].clear();
+    if (format_[i] == kTfLiteDimDense)
+    {
+      // If dimension is dense, just store the shape.
+      dim_metadata_[i * 2].push_back(expanded_shape[traversal_order_[i]]);
+    }
+    else
+    {
+      dim_metadata_[i * 2].push_back(0); // Segment array always begins with 0.
+      dst_sparse_dims.push_back(i);      // Add dimension to the sparse list.
+    }
+  }
+
+  // This algorithm assumes that the block size is small enough for all the
+  // elements to fit in cache, so the strided accesses from different traversal
+  // order and the write-first-erase-later strategy shouldn't be too slow
+  int dst_dim_idx = num_expanded_dims; // equal to num_expanded_dims: coordinate is complete
+  std::vector<int> coordinate(num_expanded_dims, 0);
+  int dense_tensor_idx = 0; // index into src_data matching the current coordinate
+  while (dst_dim_idx >= 0) // ends once the outermost dim has been exhausted
+  {
+    if (dst_dim_idx == num_expanded_dims)
+    {
+      // We have a complete coordinate. Add the element to the value array if it
+      // is not zero, or if the last dimension is dense.
+      if (!IsZero(src_data[dense_tensor_idx]))
+      {
+        data_.push_back(src_data[dense_tensor_idx]);
+        // Mark all sparse dimensions that their current indices have nonzeroes.
+        for (auto dst_dim : dst_sparse_dims)
+        {
+          if (!dst_dim_has_nonzeroes[dst_dim])
+          {
+            // Only add the index to the indices array if the current nonzero
+            // is the first nonzero of the block.
+            dim_metadata_[2 * dst_dim + 1].push_back(coordinate[dst_dim]);
+            dst_dim_has_nonzeroes[dst_dim] = true;
+          }
+        }
+      }
+      else if (format_[num_expanded_dims - 1] == kTfLiteDimDense)
+      {
+        data_.push_back(src_data[dense_tensor_idx]); // zero kept; may be erased later (see below)
+      }
+      --dst_dim_idx;
+    }
+    else
+    {
+      int original_dim_idx = traversal_order_[dst_dim_idx];
+      int dim_size = expanded_shape[original_dim_idx];
+      if (dst_dim_has_nonzeroes[dst_dim_idx])
+      {
+        // If the previous block has nonzeroes, reset the flag to false since
+        // we have just moved to a new block.
+        dst_dim_has_nonzeroes[dst_dim_idx] = false;
+      }
+      else if (format_[dst_dim_idx] == kTfLiteDimSparseCSR)
+      {
+        // This block is empty. Delete unnecessary values if compressed.
+        int next_compressed_dim = inner_compressed_dim[dst_dim_idx];
+        int erase_offset = dim_metadata_[2 * dst_dim_idx + 1].size() *
+                           num_segments_of_next_compressed_dim[dst_dim_idx];
+        if (next_compressed_dim >= 0)
+        {
+          auto &segments = dim_metadata_[2 * inner_compressed_dim[dst_dim_idx]];
+          segments.erase(segments.begin() + 1 + erase_offset, segments.end());
+        }
+        else
+        {
+          data_.erase(data_.begin() + erase_offset, data_.end()); // drop values written for it
+        }
+      }
+      if (++coordinate[dst_dim_idx] < dim_size)
+      {
+        // The current dst_dim_idx is valid (not out of bound).
+        dense_tensor_idx += dst_ordered_offset[dst_dim_idx];
+        ++dst_dim_idx;
+      }
+      else
+      {
+        // dst_dim_idx has reached its dim size. Update segment array and go
+        // back to incrementing the previous dimension (dst_dim_idx - 1).
+        if (format_[dst_dim_idx] == kTfLiteDimSparseCSR)
+        {
+          dim_metadata_[2 * dst_dim_idx].push_back(dim_metadata_[2 * dst_dim_idx + 1].size());
+        }
+        coordinate[dst_dim_idx] = -1; // the pre-increment above turns this into 0 on re-entry
+        dense_tensor_idx -= dst_ordered_offset[dst_dim_idx] * dim_size; // undo this dim's steps
+        --dst_dim_idx;
+      }
+    }
+  }
+
+  return true;
+}
+
+template <typename T> bool FormatConverter<T>::IsZero(const T val)
+{ // NOTE(review): for T = uint16_t (float16 bits) only pattern 0x0000 matches, not -0.0 (0x8000)
+  return (val == static_cast<T>(0));
+}
+
+template class FormatConverter<float>;
+template class FormatConverter<uint16_t>; // float16
+
+} // namespace sparsity
diff --git a/compiler/tflchef/core/src/Convert.h b/compiler/tflchef/core/src/Convert.h
index 45c93d229..6e910ea2c 100644
--- a/compiler/tflchef/core/src/Convert.h
+++ b/compiler/tflchef/core/src/Convert.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,4 +35,52 @@ flatbuffers::Offset<void>
 as_tflite_sparse_index_vec(flatbuffers::FlatBufferBuilder &fb,
                            const ::tflchef::TensorSparsity_IndexVec &value);
 
+// The code in namespace sparsity is adapted from
+// https://github.com/tensorflow/tensorflow/blob/3f878cff5b698b82eea85db2b60d65a2e320850e/
+//       tensorflow/lite/kernels/internal/utils/sparsity_format_converter.h
+//       tensorflow/lite/kernels/internal/utils/sparsity_format_converter.cc
+
+namespace sparsity
+{
+
+// Storage format of each dimension in a sparse tensor.
+typedef enum TfLiteDimensionType
+{
+  kTfLiteDimDense = 0, // written out as tflite::DimensionType_DENSE by make_dim_metadata_vec
+  kTfLiteDimSparseCSR, // written out as tflite::DimensionType_SPARSE_CSR
+} TfLiteDimensionType;
+
+template <typename T> class FormatConverter // dense -> sparse tensor format converter
+{
+public:
+  FormatConverter(const std::vector<int32_t> &shape, const std::vector<int32_t> &traversal_order,
+                  const std::vector<TfLiteDimensionType> &format,
+                  const std::vector<int32_t> &block_size = {},
+                  const std::vector<int32_t> &block_map = {});
+  // Converts the dense buffer 'src_data'; the result is read back through the getters below.
+  bool DenseToSparse(const T *src_data);
+  // Read-only accessors; const-qualified so they can also be called on a const converter.
+  const std::vector<T> &GetData() const { return data_; }
+  const std::vector<std::vector<int32_t>> &GetDimMetadata() const { return dim_metadata_; }
+
+private:
+  bool IsZero(const T val);
+
+private:
+  std::vector<int32_t> dense_shape_;
+  std::vector<int32_t> blocked_shape_;
+  size_t dense_size_;
+  std::vector<int32_t> traversal_order_;
+  std::vector<TfLiteDimensionType> format_;
+  std::vector<int32_t> block_size_;
+  std::vector<int32_t> block_map_;
+  std::vector<std::vector<int32_t>> dim_metadata_; // [2*d]: segments or dense size, [2*d+1]: indices
+  std::vector<T> data_;                            // value buffer exposed by GetData()
+};
+
+extern template class FormatConverter<float>;
+extern template class FormatConverter<uint16_t>; // float16
+
+} // namespace sparsity
+
 #endif // __CONVERT_H__
diff --git a/compiler/tflchef/core/src/DataChef.def b/compiler/tflchef/core/src/DataChef.def
index c634c047e..28a5b7617 100644
--- a/compiler/tflchef/core/src/DataChef.def
+++ b/compiler/tflchef/core/src/DataChef.def
@@ -21,3 +21,7 @@ DATA_CHEF(FLOAT32, gaussian, GaussianFloat32DataChefFactory)
 DATA_CHEF(INT32, gaussian, GaussianInt32DataChefFactory)
 DATA_CHEF(INT16, gaussian, GaussianInt16DataChefFactory)
 DATA_CHEF(UINT8, gaussian, GaussianUint8DataChefFactory)
+
+// FLOAT16 is supported only by the explicit and gaussian fillers for now
+DATA_CHEF(FLOAT16, explicit, ExplicitFloat16DataChefFactory)
+DATA_CHEF(FLOAT16, gaussian, GaussianFloat16DataChefFactory)
diff --git a/compiler/tflchef/core/src/ModelChef.cpp b/compiler/tflchef/core/src/ModelChef.cpp
index 93b9334a6..a788adc02 100644
--- a/compiler/tflchef/core/src/ModelChef.cpp
+++ b/compiler/tflchef/core/src/ModelChef.cpp
@@ -92,6 +92,7 @@ DataChefRegistry &data_chef_registry(const tflchef::TensorType &type)
   static DataChefRegistry string;
   static DataChefRegistry boolean;
   static DataChefRegistry s16;
+  static DataChefRegistry fp16;
 
   switch (type)
   {
@@ -101,6 +102,8 @@ DataChefRegistry &data_chef_registry(const tflchef::TensorType &type)
       return s64;
     case tflchef::FLOAT32:
       return fp32;
+    case tflchef::FLOAT16:
+      return fp16;
     case tflchef::UINT8:
       return u8;
     case tflchef::STRING:
@@ -207,6 +210,41 @@ struct CookParams
   std::string noname;
 };
 
+std::vector<flatbuffers::Offset<tflite::DimensionMetadata>>
+make_dim_metadata_vec(flatbuffers::FlatBufferBuilder *flatbuffer_builder, int32_t dims_count,
+                      const std::vector<int> &traversal_order_vec,
+                      const std::vector<sparsity::TfLiteDimensionType> &format_vec,
+                      const std::vector<std::vector<int32_t>> &dim_metadata_src)
+{
+  // Build the sparsity parameter: one DimensionMetadata offset per dimension (dims_count total).
+  std::vector<flatbuffers::Offset<tflite::DimensionMetadata>> dim_metadata_vec(dims_count);
+  for (int32_t i = 0; i < dims_count; i++)
+  {
+    const int32_t metadata_idx = 2 * i; // dim_metadata_src holds two arrays per dimension
+    if (format_vec[traversal_order_vec[i]] == sparsity::kTfLiteDimSparseCSR)
+    { // sparse: [metadata_idx] is stored as segments, [metadata_idx + 1] as indices
+      auto array_segments =
+        tflite::CreateInt32Vector(*flatbuffer_builder,
+                                  flatbuffer_builder->CreateVector(dim_metadata_src[metadata_idx]))
+          .Union();
+      auto array_indices =
+        tflite::CreateInt32Vector(
+          *flatbuffer_builder, flatbuffer_builder->CreateVector(dim_metadata_src[metadata_idx + 1]))
+          .Union();
+      dim_metadata_vec[i] =
+        tflite::CreateDimensionMetadata(*flatbuffer_builder, tflite::DimensionType_SPARSE_CSR, 0,
+                                        tflite::SparseIndexVector_Int32Vector, array_segments,
+                                        tflite::SparseIndexVector_Int32Vector, array_indices);
+    }
+    else
+    { // dense: only the first element of [metadata_idx] is used
+      dim_metadata_vec[i] = tflite::CreateDimensionMetadata(
+        *flatbuffer_builder, tflite::DimensionType_DENSE, dim_metadata_src[metadata_idx][0]);
+    }
+  }
+  return dim_metadata_vec;
+}
+
 template <typename T> std::map<std::string, int32_t> cook_graph(const T &graph, CookParams &cp)
 {
   LOGGER(l);
@@ -271,6 +309,8 @@ template <typename T> std::map<std::string, int32_t> cook_graph(const T &graph,
 
     assert(operand.has_type());
 
+    flatbuffers::Offset<tflite::SparsityParameters> sparsity_index;
+
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> shape;
     std::vector<int32_t> dims;
     if (operand.has_shape())
@@ -298,16 +338,125 @@ template <typename T> std::map<std::string, int32_t> cook_graph(const T &graph,
       // Create Data
       int32_t count = (element_count(dims) > 0) ? element_count(dims) : filler.arg_size();
       auto data_vec = chef->generate(count);
-      auto data = flatbuffer_builder->CreateVector(data_vec);
 
-      // Create Buffer
-      tflite::BufferBuilder buffer_builder{*flatbuffer_builder};
-      buffer_builder.add_data(data);
-      auto buffer = buffer_builder.Finish();
+      if (operand.has_make_sparse() && operand.make_sparse())
+      {
+        assert(not operand.has_sparsity());
+        assert(operand.has_shape());
+
+        const int32_t dims_count = dims.size();
+        std::vector<int> traversal_order_vec;
+        std::vector<sparsity::TfLiteDimensionType> format_vec;
+        for (int32_t o = 0; o < dims_count; ++o)
+          traversal_order_vec.push_back(o);
+        for (int32_t o = 0; o < dims_count - 1; ++o)
+          format_vec.push_back(sparsity::kTfLiteDimDense);
+        format_vec.push_back(sparsity::kTfLiteDimSparseCSR);
+
+        if (operand.type() == tflchef::FLOAT32)
+        {
+          ::sparsity::FormatConverter<float> converter(dims, traversal_order_vec, format_vec);
+          converter.DenseToSparse(reinterpret_cast<const float *>(data_vec.data()));
+          const auto &sparse_data = converter.GetData();
+
+          std::vector<uint8_t> sparse_uint8;
+          for (int c = 0; c < sparse_data.size(); ++c)
+          {
+            const float value = sparse_data.at(c);
+            const uint8_t *arr = reinterpret_cast<const uint8_t *>(&value);
+            for (uint32_t b = 0; b < sizeof(float); ++b)
+            {
+              sparse_uint8.emplace_back(arr[b]);
+            }
+          }
+          auto data = flatbuffer_builder->CreateVector(sparse_uint8);
+
+          // Create Buffer
+          tflite::BufferBuilder buffer_builder{*flatbuffer_builder};
+          buffer_builder.add_data(data);
+          auto buffer = buffer_builder.Finish();
+
+          // Update Buffer Index & Vector
+          buffer_index = buffer_vec.size();
+          buffer_vec.emplace_back(buffer);
+
+          // save SparsityParameters
+          auto traversal_order = flatbuffer_builder->CreateVector(traversal_order_vec);
+
+          // Create block map
+          std::vector<int> block_map_vec{};
+          auto block_map = flatbuffer_builder->CreateVector(block_map_vec);
+
+          // Create dimension metadata
+          const auto &dim_metadata_src = converter.GetDimMetadata();
+          auto dim_metadata_vec =
+            make_dim_metadata_vec(flatbuffer_builder.get(), dims_count, traversal_order_vec,
+                                  format_vec, dim_metadata_src);
+          auto dim_metadata = flatbuffer_builder->CreateVector(dim_metadata_vec);
+          sparsity_index = tflite::CreateSparsityParameters(*flatbuffer_builder, traversal_order,
+                                                            block_map, dim_metadata);
+        }
+        else if (operand.type() == tflchef::FLOAT16)
+        {
+          ::sparsity::FormatConverter<uint16_t> converter(dims, traversal_order_vec, format_vec);
+          converter.DenseToSparse(reinterpret_cast<const uint16_t *>(data_vec.data()));
+          const auto &sparse_data = converter.GetData();
+
+          std::vector<uint8_t> sparse_uint8;
+          for (int c = 0; c < sparse_data.size(); ++c)
+          {
+            const uint16_t value = sparse_data.at(c);
+            const uint8_t *arr = reinterpret_cast<const uint8_t *>(&value);
+            for (uint32_t b = 0; b < sizeof(uint16_t); ++b)
+            {
+              sparse_uint8.emplace_back(arr[b]);
+            }
+          }
+          auto data = flatbuffer_builder->CreateVector(sparse_uint8);
+
+          // Create Buffer
+          tflite::BufferBuilder buffer_builder{*flatbuffer_builder};
+          buffer_builder.add_data(data);
+          auto buffer = buffer_builder.Finish();
+
+          // Update Buffer Index & Vector
+          buffer_index = buffer_vec.size();
+          buffer_vec.emplace_back(buffer);
+
+          // save SparsityParameters
+          auto traversal_order = flatbuffer_builder->CreateVector(traversal_order_vec);
+
+          // Create block map
+          std::vector<int> block_map_vec{};
+          auto block_map = flatbuffer_builder->CreateVector(block_map_vec);
+
+          // Create dimension metadata
+          const auto &dim_metadata_src = converter.GetDimMetadata();
+          auto dim_metadata_vec =
+            make_dim_metadata_vec(flatbuffer_builder.get(), dims_count, traversal_order_vec,
+                                  format_vec, dim_metadata_src);
+          auto dim_metadata = flatbuffer_builder->CreateVector(dim_metadata_vec);
+          sparsity_index = tflite::CreateSparsityParameters(*flatbuffer_builder, traversal_order,
+                                                            block_map, dim_metadata);
+        }
+        else
+        {
+          throw std::runtime_error{"NYI: unsupported operand type"};
+        }
+      }
+      else
+      {
+        auto data = flatbuffer_builder->CreateVector(data_vec);
+
+        // Create Buffer
+        tflite::BufferBuilder buffer_builder{*flatbuffer_builder};
+        buffer_builder.add_data(data);
+        auto buffer = buffer_builder.Finish();
 
-      // Update Buffer Index & Vector
-      buffer_index = buffer_vec.size();
-      buffer_vec.emplace_back(buffer);
+        // Update Buffer Index & Vector
+        buffer_index = buffer_vec.size();
+        buffer_vec.emplace_back(buffer);
+      }
     }
     else
     {
@@ -384,8 +533,6 @@ template <typename T> std::map<std::string, int32_t> cook_graph(const T &graph,
       quant_index = quant_builder.Finish();
     }
 
-    flatbuffers::Offset<tflite::SparsityParameters> sparsity_index;
-
     if (operand.has_sparsity())
     {
       const auto &sparsity = operand.sparsity();
diff --git a/compiler/tflchef/core/src/Op/Densify.cpp b/compiler/tflchef/core/src/Op/Densify.cpp
new file mode 100644
index 000000000..63c4e207a
--- /dev/null
+++ b/compiler/tflchef/core/src/Op/Densify.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Densify.h"
+
+flatbuffers::Offset<void> DensifyChef::value(flatbuffers::FlatBufferBuilder &fbb) const
+{
+  tflite::DensifyOptionsBuilder options_builder{fbb};
+  // No option fields are set: the table is finished as-is and returned as a union offset.
+  return options_builder.Finish().Union();
+}
+
+std::unique_ptr<OpChef> DensifyChefFactory::create(const tflchef::Operation *operation) const
+{
+  return std::unique_ptr<OpChef>{new DensifyChef{operation}}; // caller takes ownership
+}
diff --git a/compiler/tflchef/core/src/Op/Densify.h b/compiler/tflchef/core/src/Op/Densify.h
new file mode 100644
index 000000000..f6af693d9
--- /dev/null
+++ b/compiler/tflchef/core/src/Op/Densify.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __OP_DENSIFY_H__
+#define __OP_DENSIFY_H__
+
+#include "OpChef.h"
+
+class DensifyChef final : public OpChef
+{
+public:
+  explicit DensifyChef(const tflchef::Operation *operation) : _operation{operation}
+  {
+    // DO NOTHING
+  }
+  // OpChef overrides: builtin operator code, options union type, and serialized options.
+public:
+  tflite::BuiltinOperator code(void) const override { return tflite::BuiltinOperator_DENSIFY; }
+
+  tflite::BuiltinOptions type(void) const override { return tflite::BuiltinOptions_DensifyOptions; }
+
+  flatbuffers::Offset<void> value(flatbuffers::FlatBufferBuilder &fbb) const override;
+  // Recipe operation passed to the constructor; only stored, never freed by this class.
+private:
+  const tflchef::Operation *_operation;
+};
+
+struct DensifyChefFactory final : public OpChefFactory // factory for DensifyChef
+{
+  std::unique_ptr<OpChef> create(const tflchef::Operation *operation) const override;
+};
+
+#endif // __OP_DENSIFY_H__
diff --git a/compiler/tflchef/core/src/OpChef.def b/compiler/tflchef/core/src/OpChef.def
index beebd359f..c19d00dfb 100644
--- a/compiler/tflchef/core/src/OpChef.def
+++ b/compiler/tflchef/core/src/OpChef.def
@@ -18,6 +18,7 @@ OP_CHEF(Ceil, CeilChefFactory)
 OP_CHEF(Concatenation, ConcatenationChefFactory)
 OP_CHEF(Conv2D, Conv2DChefFactory)
 OP_CHEF(Cos, CosChefFactory)
+OP_CHEF(Densify, DensifyChefFactory)
 OP_CHEF(DepthToSpace, DepthToSpaceChefFactory)
 OP_CHEF(DepthwiseConv2D, DepthwiseConv2DChefFactory)
 OP_CHEF(Dequantize, DequantizeChefFactory)
diff --git a/compiler/tflchef/core/src/OpChefs.h b/compiler/tflchef/core/src/OpChefs.h
index 159019abf..3cd3be558 100644
--- a/compiler/tflchef/core/src/OpChefs.h
+++ b/compiler/tflchef/core/src/OpChefs.h
@@ -31,6 +31,7 @@
 #include "Op/Concatenation.h"
 #include "Op/Conv2D.h"
 #include "Op/Cos.h"
+#include "Op/Densify.h"
 #include "Op/DepthToSpace.h"
 #include "Op/DepthwiseConv2D.h"
 #include "Op/Dequantize.h"
diff --git a/compiler/tflchef/proto/tflchef.proto b/compiler/tflchef/proto/tflchef.proto
index 1abefafe1..da4b6920d 100644
--- a/compiler/tflchef/proto/tflchef.proto
+++ b/compiler/tflchef/proto/tflchef.proto
@@ -15,6 +15,7 @@ package tflchef;
 // This enum value corresponds to TensorType in TensorFlow Lite schema
 enum TensorType {
   FLOAT32 = 0;
+  FLOAT16 = 1;
   INT32 = 2;
   UINT8 = 3;
   INT64 = 4;
@@ -88,6 +89,12 @@ message Operand {
   optional TensorSparsity sparsity = 6;
   optional bool is_variable = 7 [default = false];
   optional ShapeSignature shape_signature = 8;
+  // 'make_sparse' tells tflchef to generate the sparse tensor itself,
+  // because filling in 'TensorSparsity' by hand can be difficult.
+  // For now, only the last dimension is made SPARSE_CSR.
+  // e.g. shape [2, 3, 4] will have
+  //     TraversalOrder [0, 1, 2] with [DENSE, DENSE, SPARSE_CSR]
+  optional bool make_sparse = 9 [default = false];
 }
 
 // This enum value corresponds to Padding in TensorFlow Lite schema
@@ -534,6 +541,10 @@ message FakeQuantOptions {
   optional bool narrow_range = 4 [default = false];
 }
 
+message DensifyOptions {
+  // NONE
+}
+
 message Operation {
   optional string type = 1;
   repeated string input = 2;
@@ -650,6 +661,7 @@ message Operation {
   optional AddNOptions add_n_options = 207;
   optional MatMulOptions matmul_options = 208;
   optional MaxPoolWithArgmaxOptions max_pool_with_argmax_options = 209;
+  optional DensifyOptions densify_options = 210;
   // NOTE if there are more than two options with same type of Options
   // use the number not listed in the above reserve list
 }
diff --git a/compiler/tflchef/tests/make_sparse/test.recipe b/compiler/tflchef/tests/make_sparse/test.recipe
new file mode 100644
index 000000000..15cc93a5d
--- /dev/null
+++ b/compiler/tflchef/tests/make_sparse/test.recipe
@@ -0,0 +1,44 @@
+operand {
+  name: "in"
+  type: FLOAT32
+  shape { dim: 4 dim: 4 }
+}
+operand {
+  name: "sparse"
+  type: FLOAT32
+  shape { dim: 4 dim: 4 }
+  filler {
+    tag: "explicit"
+    arg: "2" arg: "0" arg: "0" arg: "0"
+    arg: "0" arg: "0" arg: "0" arg: "0"
+    arg: "0" arg: "0" arg: "0" arg: "0"
+    arg: "0" arg: "0" arg: "0" arg: "3"
+  }
+  make_sparse: true
+}
+operand {
+  name: "dense"
+  type: FLOAT32
+  shape { dim: 4 dim: 4 }
+}
+operand {
+  name: "out"
+  type: FLOAT32
+  shape { dim: 4 dim: 4 }
+}
+operation {
+  type: "Densify"
+  input: "sparse"
+  output: "dense"
+}
+operation {
+  type: "Add"
+  input: "in"
+  input: "dense"
+  output: "out"
+  add_options {
+    activation: NONE
+  }
+}
+input: "in"
+output: "out"
diff --git a/compiler/tflchef/tests/make_sparse_f16/test.recipe b/compiler/tflchef/tests/make_sparse_f16/test.recipe
new file mode 100644
index 000000000..5977a1d32
--- /dev/null
+++ b/compiler/tflchef/tests/make_sparse_f16/test.recipe
@@ -0,0 +1,54 @@
+operand {
+  name: "in"
+  type: FLOAT32
+  shape { dim: 4 dim: 4 }
+}
+operand {
+  name: "sparse16"
+  type: FLOAT16
+  shape { dim: 4 dim: 4 }
+  filler {
+    tag: "explicit"
+    arg: "2" arg: "0" arg: "0" arg: "0"
+    arg: "0" arg: "0" arg: "0" arg: "0"
+    arg: "0" arg: "0" arg: "0" arg: "0"
+    arg: "0" arg: "0" arg: "0" arg: "3"
+  }
+  make_sparse: true
+}
+operand {
+  name: "dense16"
+  type: FLOAT16
+  shape { dim: 4 dim: 4 }
+}
+operand {
+  name: "dense32"
+  type: FLOAT32
+  shape { dim: 4 dim: 4 }
+}
+operand {
+  name: "out"
+  type: FLOAT32
+  shape { dim: 4 dim: 4 }
+}
+operation {
+  type: "Densify"
+  input: "sparse16"
+  output: "dense16"
+}
+operation {
+  type: "Dequantize"
+  input: "dense16"
+  output: "dense32"
+}
+operation {
+  type: "Add"
+  input: "in"
+  input: "dense32"
+  output: "out"
+  add_options {
+    activation: NONE
+  }
+}
+input: "in"
+output: "out"
diff --git a/compiler/tflchef/tflite/CMakeLists.txt b/compiler/tflchef/tflite/CMakeLists.txt
index 3c3352b0a..d9a20a2e1 100644
--- a/compiler/tflchef/tflite/CMakeLists.txt
+++ b/compiler/tflchef/tflite/CMakeLists.txt
@@ -3,6 +3,7 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
 add_library(tflchef_tflite STATIC ${SOURCES})
 target_include_directories(tflchef_tflite PUBLIC include)
 target_include_directories(tflchef_tflite PRIVATE src)
+target_include_directories(tflchef_tflite PRIVATE src/Op/include)
 target_link_libraries(tflchef_tflite tflchef_proto)
 target_link_libraries(tflchef_tflite mio_tflite280)
 target_link_libraries(tflchef_tflite mio_tflite280_helper)
diff --git a/compiler/tflchef/tflite/src/Convert.cpp b/compiler/tflchef/tflite/src/Convert.cpp
index f47e51d3d..242987661 100644
--- a/compiler/tflchef/tflite/src/Convert.cpp
+++ b/compiler/tflchef/tflite/src/Convert.cpp
@@ -35,8 +35,9 @@ tflchef::TensorType as_tflchef_type(const tflite::TensorType type)
       return tflchef::BOOL;
     case tflite::TensorType_INT16:
       return tflchef::INT16;
+    case tflite::TensorType_FLOAT16:
+      return tflchef::FLOAT16;
     // TODO handle other types
-    // TensorType_FLOAT16
     // TensorType_STRING
     // TensorType_COMPLEX64
     default:
diff --git a/compiler/tflchef/tflite/src/FillerHelper.cpp b/compiler/tflchef/tflite/src/FillerHelper.cpp
index cf96d2e8c..1ac99ad40 100644
--- a/compiler/tflchef/tflite/src/FillerHelper.cpp
+++ b/compiler/tflchef/tflite/src/FillerHelper.cpp
@@ -48,3 +48,18 @@ void fill_tensor_to_import(int32_t idx, TFliteImport *import)
 }
 
 } // namespace tflchef
+
+// Common helpers for filling operator inputs
+namespace tflchef
+{
+
+void fill_two_inputs(const tflite::Operator *op, TFliteImport *import)
+{
+  const std::vector<int32_t> &inputs = as_index_vector(op->inputs());
+  assert(inputs.size() == 2);
+  // Pass both input tensor indices on; the arity is only checked by the assert above.
+  fill_tensor_to_import(inputs[0], import);
+  fill_tensor_to_import(inputs[1], import);
+}
+
+} // namespace tflchef
diff --git a/compiler/tflchef/tflite/src/FillerHelper.h b/compiler/tflchef/tflite/src/FillerHelper.h
index 053a5c18a..e96ae73d0 100644
--- a/compiler/tflchef/tflite/src/FillerHelper.h
+++ b/compiler/tflchef/tflite/src/FillerHelper.h
@@ -28,4 +28,12 @@ void fill_tensor_to_import(int32_t idx, TFliteImport *import);
 
 } // namespace tflchef
 
+// Common helpers for filling operator inputs
+namespace tflchef
+{
+
+void fill_two_inputs(const tflite::Operator *op, TFliteImport *import);
+
+} // namespace tflchef
+
 #endif // __FILLER_HELPER_H__
diff --git a/compiler/tflchef/tflite/src/Op/Add.cpp b/compiler/tflchef/tflite/src/Op/Add.cpp
index 3e880a63b..23d360616 100644
--- a/compiler/tflchef/tflite/src/Op/Add.cpp
+++ b/compiler/tflchef/tflite/src/Op/Add.cpp
@@ -27,11 +27,7 @@ void TFliteOpAdd::filler(const tflite::Operator *op, TFliteImport *import,
 {
   // Add may have constant input
 
-  const std::vector<int32_t> &inputs = as_index_vector(op->inputs());
-  assert(inputs.size() == 2);
-
-  fill_tensor_to_import(inputs[0], import);
-  fill_tensor_to_import(inputs[1], import);
+  fill_two_inputs(op, import);
 }
 
 tflchef::Operation *TFliteOpAdd::build(const tflite::Operator *op, TFliteImport *import,
diff --git a/compiler/tflchef/tflite/src/Op/Maximum.cpp b/compiler/tflchef/tflite/src/Op/Maximum.cpp
index d52caf0c2..65e4c2c99 100644
--- a/compiler/tflchef/tflite/src/Op/Maximum.cpp
+++ b/compiler/tflchef/tflite/src/Op/Maximum.cpp
@@ -25,11 +25,7 @@ namespace tflchef
 void TFliteOpMaximum::filler(const tflite::Operator *op, TFliteImport *import,
                              tflchef::ModelRecipe *model_recipe) const
 {
-  const std::vector<int32_t> &inputs = as_index_vector(op->inputs());
-  assert(inputs.size() == 2);
-
-  fill_tensor_to_import(inputs[0], import);
-  fill_tensor_to_import(inputs[1], import);
+  fill_two_inputs(op, import);
 }
 
 tflchef::Operation *TFliteOpMaximum::build(const tflite::Operator *op, TFliteImport *import,
diff --git a/compiler/tflchef/tflite/src/Op/Minimum.cpp b/compiler/tflchef/tflite/src/Op/Minimum.cpp
index 6440f1deb..b4d255ce3 100644
--- a/compiler/tflchef/tflite/src/Op/Minimum.cpp
+++ b/compiler/tflchef/tflite/src/Op/Minimum.cpp
@@ -25,11 +25,7 @@ namespace tflchef
 void TFliteOpMinimum::filler(const tflite::Operator *op, TFliteImport *import,
                              tflchef::ModelRecipe *model_recipe) const
 {
-  const std::vector<int32_t> &inputs = as_index_vector(op->inputs());
-  assert(inputs.size() == 2);
-
-  fill_tensor_to_import(inputs[0], import);
-  fill_tensor_to_import(inputs[1], import);
+  fill_two_inputs(op, import);
 }
 
 tflchef::Operation *TFliteOpMinimum::build(const tflite::Operator *op, TFliteImport *import,
diff --git a/compiler/tflchef/tflite/src/Op/Mul.cpp b/compiler/tflchef/tflite/src/Op/Mul.cpp
index 9faa4acaf..1145ff7e6 100644
--- a/compiler/tflchef/tflite/src/Op/Mul.cpp
+++ b/compiler/tflchef/tflite/src/Op/Mul.cpp
@@ -27,11 +27,7 @@ void TFliteOpMul::filler(const tflite::Operator *op, TFliteImport *import,
 {
   // Mul may have constant input
 
-  const std::vector<int32_t> &inputs = as_index_vector(op->inputs());
-  assert(inputs.size() == 2);
-
-  fill_tensor_to_import(inputs[0], import);
-  fill_tensor_to_import(inputs[1], import);
+  fill_two_inputs(op, import);
 }
 
 tflchef::Operation *TFliteOpMul::build(const tflite::Operator *op, TFliteImport *import,
diff --git a/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV4.cpp b/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV4.cpp
index ad9921970..4f096ced4 100644
--- a/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV4.cpp
+++ b/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV4.cpp
@@ -38,7 +38,7 @@ void TFliteOpNonMaxSuppressionV4::filler(const tflite::Operator *op, TFliteImpor
 
   for (int32_t index = 2; index < 5; ++index)
   {
-    fill_tensor_to_import(index, import);
+    fill_tensor_to_import(inputs[index], import);
   }
 }
 
diff --git a/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV5.cpp b/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV5.cpp
index db7f4c932..332cba0ff 100644
--- a/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV5.cpp
+++ b/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV5.cpp
@@ -41,7 +41,7 @@ void TFliteOpNonMaxSuppressionV5::filler(const tflite::Operator *op, TFliteImpor
 
   for (int32_t index = 2; index < 6; ++index)
   {
-    fill_tensor_to_import(index, import);
+    fill_tensor_to_import(inputs[index], import);
   }
 }
 
diff --git a/compiler/tflchef/tflite/src/Op/PadV2.cpp b/compiler/tflchef/tflite/src/Op/PadV2.cpp
index 0b1c9f3b2..a6b657f59 100644
--- a/compiler/tflchef/tflite/src/Op/PadV2.cpp
+++ b/compiler/tflchef/tflite/src/Op/PadV2.cpp
@@ -16,6 +16,7 @@
 
 #include "PadV2.h"
 
+#include "Convert.h"
 #include "FillerHelper.h"
 
 namespace tflchef
@@ -24,9 +25,11 @@ namespace tflchef
 void TFliteOpPadV2::filler(const tflite::Operator *op, TFliteImport *import,
                            tflchef::ModelRecipe *model_recipe) const
 {
+  const std::vector<int32_t> &inputs = as_index_vector(op->inputs());
+
   // Filler for paddings and constant_values
-  fill_tensor_to_import(1, import);
-  fill_tensor_to_import(2, import);
+  fill_tensor_to_import(inputs[1], import);
+  fill_tensor_to_import(inputs[2], import);
 }
 
 tflchef::Operation *TFliteOpPadV2::build(const tflite::Operator *op, TFliteImport *import,
diff --git a/compiler/tflchef/tflite/src/Op/ScatterNd.cpp b/compiler/tflchef/tflite/src/Op/ScatterNd.cpp
index 548a09a67..ec09a69a4 100644
--- a/compiler/tflchef/tflite/src/Op/ScatterNd.cpp
+++ b/compiler/tflchef/tflite/src/Op/ScatterNd.cpp
@@ -25,9 +25,11 @@ namespace tflchef
 void TFliteOpScatterNd::filler(const tflite::Operator *op, TFliteImport *import,
                                tflchef::ModelRecipe *model_recipe) const
 {
+  const std::vector<int32_t> &inputs = as_index_vector(op->inputs());
+
   // Filler for indices and shape
-  fill_tensor_to_import(0, import);
-  fill_tensor_to_import(2, import);
+  fill_tensor_to_import(inputs[0], import);
+  fill_tensor_to_import(inputs[2], import);
 }
 
 tflchef::Operation *TFliteOpScatterNd::build(const tflite::Operator *, TFliteImport *,
diff --git a/compiler/tflchef/tflite/src/Op/SegmentSum.cpp b/compiler/tflchef/tflite/src/Op/SegmentSum.cpp
index a975ca4b3..bc45a94e0 100644
--- a/compiler/tflchef/tflite/src/Op/SegmentSum.cpp
+++ b/compiler/tflchef/tflite/src/Op/SegmentSum.cpp
@@ -16,6 +16,7 @@
 
 #include "SegmentSum.h"
 
+#include "Convert.h"
 #include "FillerHelper.h"
 
 namespace tflchef
@@ -24,8 +25,10 @@ namespace tflchef
 void TFliteOpSegmentSum::filler(const tflite::Operator *op, TFliteImport *import,
                                 tflchef::ModelRecipe *model_recipe) const
 {
-  // Filler for indices and shape
-  fill_tensor_to_import(1, import);
+  const std::vector<int32_t> &inputs = as_index_vector(op->inputs());
+
+  // Filler for segment_ids
+  fill_tensor_to_import(inputs[1], import);
 }
 
 tflchef::Operation *TFliteOpSegmentSum::build(const tflite::Operator *op, TFliteImport *import,
diff --git a/compiler/tflchef/tflite/src/Op/Sub.cpp b/compiler/tflchef/tflite/src/Op/Sub.cpp
index 0a08bbfdf..584be0ab9 100644
--- a/compiler/tflchef/tflite/src/Op/Sub.cpp
+++ b/compiler/tflchef/tflite/src/Op/Sub.cpp
@@ -27,11 +27,7 @@ void TFliteOpSub::filler(const tflite::Operator *op, TFliteImport *import,
 {
   // Sub may have constant input
 
-  const std::vector<int32_t> &inputs = as_index_vector(op->inputs());
-  assert(inputs.size() == 2);
-
-  fill_tensor_to_import(inputs[0], import);
-  fill_tensor_to_import(inputs[1], import);
+  fill_two_inputs(op, import);
 }
 
 tflchef::Operation *TFliteOpSub::build(const tflite::Operator *op, TFliteImport *import,
diff --git a/compiler/tflchef/tflite/src/Op/Abs.h b/compiler/tflchef/tflite/src/Op/include/Abs.h
index d99b0d593..d99b0d593 100644
--- a/compiler/tflchef/tflite/src/Op/Abs.h
+++ b/compiler/tflchef/tflite/src/Op/include/Abs.h
diff --git a/compiler/tflchef/tflite/src/Op/Add.h b/compiler/tflchef/tflite/src/Op/include/Add.h
index 49d945f8b..49d945f8b 100644
--- a/compiler/tflchef/tflite/src/Op/Add.h
+++ b/compiler/tflchef/tflite/src/Op/include/Add.h
diff --git a/compiler/tflchef/tflite/src/Op/AddN.h b/compiler/tflchef/tflite/src/Op/include/AddN.h
index 4387aa06a..4387aa06a 100644
--- a/compiler/tflchef/tflite/src/Op/AddN.h
+++ b/compiler/tflchef/tflite/src/Op/include/AddN.h
diff --git a/compiler/tflchef/tflite/src/Op/ArgMax.h b/compiler/tflchef/tflite/src/Op/include/ArgMax.h
index 30068ecf2..30068ecf2 100644
--- a/compiler/tflchef/tflite/src/Op/ArgMax.h
+++ b/compiler/tflchef/tflite/src/Op/include/ArgMax.h
diff --git a/compiler/tflchef/tflite/src/Op/ArgMin.h b/compiler/tflchef/tflite/src/Op/include/ArgMin.h
index 83c643c1a..83c643c1a 100644
--- a/compiler/tflchef/tflite/src/Op/ArgMin.h
+++ b/compiler/tflchef/tflite/src/Op/include/ArgMin.h
diff --git a/compiler/tflchef/tflite/src/Op/AveragePool2D.h b/compiler/tflchef/tflite/src/Op/include/AveragePool2D.h
index f9e9fb254..f9e9fb254 100644
--- a/compiler/tflchef/tflite/src/Op/AveragePool2D.h
+++ b/compiler/tflchef/tflite/src/Op/include/AveragePool2D.h
diff --git a/compiler/tflchef/tflite/src/Op/BatchMatMul.h b/compiler/tflchef/tflite/src/Op/include/BatchMatMul.h
index 6eb4c6e68..6eb4c6e68 100644
--- a/compiler/tflchef/tflite/src/Op/BatchMatMul.h
+++ b/compiler/tflchef/tflite/src/Op/include/BatchMatMul.h
diff --git a/compiler/tflchef/tflite/src/Op/BatchToSpaceND.h b/compiler/tflchef/tflite/src/Op/include/BatchToSpaceND.h
index ae2114c97..ae2114c97 100644
--- a/compiler/tflchef/tflite/src/Op/BatchToSpaceND.h
+++ b/compiler/tflchef/tflite/src/Op/include/BatchToSpaceND.h
diff --git a/compiler/tflchef/tflite/src/Op/BidirectionalSequenceLSTM.h b/compiler/tflchef/tflite/src/Op/include/BidirectionalSequenceLSTM.h
index 333f542ac..333f542ac 100644
--- a/compiler/tflchef/tflite/src/Op/BidirectionalSequenceLSTM.h
+++ b/compiler/tflchef/tflite/src/Op/include/BidirectionalSequenceLSTM.h
diff --git a/compiler/tflchef/tflite/src/Op/Cast.h b/compiler/tflchef/tflite/src/Op/include/Cast.h
index 29c126c93..29c126c93 100644
--- a/compiler/tflchef/tflite/src/Op/Cast.h
+++ b/compiler/tflchef/tflite/src/Op/include/Cast.h
diff --git a/compiler/tflchef/tflite/src/Op/Ceil.h b/compiler/tflchef/tflite/src/Op/include/Ceil.h
index 44df20778..44df20778 100644
--- a/compiler/tflchef/tflite/src/Op/Ceil.h
+++ b/compiler/tflchef/tflite/src/Op/include/Ceil.h
diff --git a/compiler/tflchef/tflite/src/Op/Concatenation.h b/compiler/tflchef/tflite/src/Op/include/Concatenation.h
index 4a7ea5791..4a7ea5791 100644
--- a/compiler/tflchef/tflite/src/Op/Concatenation.h
+++ b/compiler/tflchef/tflite/src/Op/include/Concatenation.h
diff --git a/compiler/tflchef/tflite/src/Op/Conv2D.h b/compiler/tflchef/tflite/src/Op/include/Conv2D.h
index 0216e9ce9..0216e9ce9 100644
--- a/compiler/tflchef/tflite/src/Op/Conv2D.h
+++ b/compiler/tflchef/tflite/src/Op/include/Conv2D.h
diff --git a/compiler/tflchef/tflite/src/Op/Cos.h b/compiler/tflchef/tflite/src/Op/include/Cos.h
index 8f3dbe3a6..8f3dbe3a6 100644
--- a/compiler/tflchef/tflite/src/Op/Cos.h
+++ b/compiler/tflchef/tflite/src/Op/include/Cos.h
diff --git a/compiler/tflchef/tflite/src/Op/DepthToSpace.h b/compiler/tflchef/tflite/src/Op/include/DepthToSpace.h
index b5852ac89..b5852ac89 100644
--- a/compiler/tflchef/tflite/src/Op/DepthToSpace.h
+++ b/compiler/tflchef/tflite/src/Op/include/DepthToSpace.h
diff --git a/compiler/tflchef/tflite/src/Op/DepthwiseConv2D.h b/compiler/tflchef/tflite/src/Op/include/DepthwiseConv2D.h
index c172536b4..c172536b4 100644
--- a/compiler/tflchef/tflite/src/Op/DepthwiseConv2D.h
+++ b/compiler/tflchef/tflite/src/Op/include/DepthwiseConv2D.h
diff --git a/compiler/tflchef/tflite/src/Op/Dequantize.h b/compiler/tflchef/tflite/src/Op/include/Dequantize.h
index df1c7bbdb..df1c7bbdb 100644
--- a/compiler/tflchef/tflite/src/Op/Dequantize.h
+++ b/compiler/tflchef/tflite/src/Op/include/Dequantize.h
diff --git a/compiler/tflchef/tflite/src/Op/Div.h b/compiler/tflchef/tflite/src/Op/include/Div.h
index 254a4cd99..254a4cd99 100644
--- a/compiler/tflchef/tflite/src/Op/Div.h
+++ b/compiler/tflchef/tflite/src/Op/include/Div.h
diff --git a/compiler/tflchef/tflite/src/Op/ELU.h b/compiler/tflchef/tflite/src/Op/include/ELU.h
index 490c9fde4..490c9fde4 100644
--- a/compiler/tflchef/tflite/src/Op/ELU.h
+++ b/compiler/tflchef/tflite/src/Op/include/ELU.h
diff --git a/compiler/tflchef/tflite/src/Op/Equal.h b/compiler/tflchef/tflite/src/Op/include/Equal.h
index fd4b40001..fd4b40001 100644
--- a/compiler/tflchef/tflite/src/Op/Equal.h
+++ b/compiler/tflchef/tflite/src/Op/include/Equal.h
diff --git a/compiler/tflchef/tflite/src/Op/Exp.h b/compiler/tflchef/tflite/src/Op/include/Exp.h
index 5ff3ddc8b..5ff3ddc8b 100644
--- a/compiler/tflchef/tflite/src/Op/Exp.h
+++ b/compiler/tflchef/tflite/src/Op/include/Exp.h
diff --git a/compiler/tflchef/tflite/src/Op/ExpandDims.h b/compiler/tflchef/tflite/src/Op/include/ExpandDims.h
index e2f3e4e50..e2f3e4e50 100644
--- a/compiler/tflchef/tflite/src/Op/ExpandDims.h
+++ b/compiler/tflchef/tflite/src/Op/include/ExpandDims.h
diff --git a/compiler/tflchef/tflite/src/Op/FakeQuant.h b/compiler/tflchef/tflite/src/Op/include/FakeQuant.h
index f36e615df..f36e615df 100644
--- a/compiler/tflchef/tflite/src/Op/FakeQuant.h
+++ b/compiler/tflchef/tflite/src/Op/include/FakeQuant.h
diff --git a/compiler/tflchef/tflite/src/Op/Fill.h b/compiler/tflchef/tflite/src/Op/include/Fill.h
index 4f46f628a..4f46f628a 100644
--- a/compiler/tflchef/tflite/src/Op/Fill.h
+++ b/compiler/tflchef/tflite/src/Op/include/Fill.h
diff --git a/compiler/tflchef/tflite/src/Op/Floor.h b/compiler/tflchef/tflite/src/Op/include/Floor.h
index f0f8ef38a..f0f8ef38a 100644
--- a/compiler/tflchef/tflite/src/Op/Floor.h
+++ b/compiler/tflchef/tflite/src/Op/include/Floor.h
diff --git a/compiler/tflchef/tflite/src/Op/FloorDiv.h b/compiler/tflchef/tflite/src/Op/include/FloorDiv.h
index 5d049a668..5d049a668 100644
--- a/compiler/tflchef/tflite/src/Op/FloorDiv.h
+++ b/compiler/tflchef/tflite/src/Op/include/FloorDiv.h
diff --git a/compiler/tflchef/tflite/src/Op/FloorMod.h b/compiler/tflchef/tflite/src/Op/include/FloorMod.h
index f36dfe813..f36dfe813 100644
--- a/compiler/tflchef/tflite/src/Op/FloorMod.h
+++ b/compiler/tflchef/tflite/src/Op/include/FloorMod.h
diff --git a/compiler/tflchef/tflite/src/Op/FullyConnected.h b/compiler/tflchef/tflite/src/Op/include/FullyConnected.h
index 8fbe1f3ed..8fbe1f3ed 100644
--- a/compiler/tflchef/tflite/src/Op/FullyConnected.h
+++ b/compiler/tflchef/tflite/src/Op/include/FullyConnected.h
diff --git a/compiler/tflchef/tflite/src/Op/Gather.h b/compiler/tflchef/tflite/src/Op/include/Gather.h
index e01276b76..e01276b76 100644
--- a/compiler/tflchef/tflite/src/Op/Gather.h
+++ b/compiler/tflchef/tflite/src/Op/include/Gather.h
diff --git a/compiler/tflchef/tflite/src/Op/GatherNd.h b/compiler/tflchef/tflite/src/Op/include/GatherNd.h
index 112f23d33..112f23d33 100644
--- a/compiler/tflchef/tflite/src/Op/GatherNd.h
+++ b/compiler/tflchef/tflite/src/Op/include/GatherNd.h
diff --git a/compiler/tflchef/tflite/src/Op/Greater.h b/compiler/tflchef/tflite/src/Op/include/Greater.h
index 3ab2d1a4e..3ab2d1a4e 100644
--- a/compiler/tflchef/tflite/src/Op/Greater.h
+++ b/compiler/tflchef/tflite/src/Op/include/Greater.h
diff --git a/compiler/tflchef/tflite/src/Op/GreaterEqual.h b/compiler/tflchef/tflite/src/Op/include/GreaterEqual.h
index 96b0af78a..96b0af78a 100644
--- a/compiler/tflchef/tflite/src/Op/GreaterEqual.h
+++ b/compiler/tflchef/tflite/src/Op/include/GreaterEqual.h
diff --git a/compiler/tflchef/tflite/src/Op/L2Normalize.h b/compiler/tflchef/tflite/src/Op/include/L2Normalize.h
index a73eae6c8..a73eae6c8 100644
--- a/compiler/tflchef/tflite/src/Op/L2Normalize.h
+++ b/compiler/tflchef/tflite/src/Op/include/L2Normalize.h
diff --git a/compiler/tflchef/tflite/src/Op/L2Pool2D.h b/compiler/tflchef/tflite/src/Op/include/L2Pool2D.h
index 046353440..046353440 100644
--- a/compiler/tflchef/tflite/src/Op/L2Pool2D.h
+++ b/compiler/tflchef/tflite/src/Op/include/L2Pool2D.h
diff --git a/compiler/tflchef/tflite/src/Op/LeakyRelu.h b/compiler/tflchef/tflite/src/Op/include/LeakyRelu.h
index 28e63e0ca..28e63e0ca 100644
--- a/compiler/tflchef/tflite/src/Op/LeakyRelu.h
+++ b/compiler/tflchef/tflite/src/Op/include/LeakyRelu.h
diff --git a/compiler/tflchef/tflite/src/Op/Less.h b/compiler/tflchef/tflite/src/Op/include/Less.h
index 1316cb613..1316cb613 100644
--- a/compiler/tflchef/tflite/src/Op/Less.h
+++ b/compiler/tflchef/tflite/src/Op/include/Less.h
diff --git a/compiler/tflchef/tflite/src/Op/LessEqual.h b/compiler/tflchef/tflite/src/Op/include/LessEqual.h
index 81c710fbc..81c710fbc 100644
--- a/compiler/tflchef/tflite/src/Op/LessEqual.h
+++ b/compiler/tflchef/tflite/src/Op/include/LessEqual.h
diff --git a/compiler/tflchef/tflite/src/Op/LocalResponseNormalization.h b/compiler/tflchef/tflite/src/Op/include/LocalResponseNormalization.h
index c0eb3f2b1..c0eb3f2b1 100644
--- a/compiler/tflchef/tflite/src/Op/LocalResponseNormalization.h
+++ b/compiler/tflchef/tflite/src/Op/include/LocalResponseNormalization.h
diff --git a/compiler/tflchef/tflite/src/Op/Log.h b/compiler/tflchef/tflite/src/Op/include/Log.h
index 9d17e2f81..9d17e2f81 100644
--- a/compiler/tflchef/tflite/src/Op/Log.h
+++ b/compiler/tflchef/tflite/src/Op/include/Log.h
diff --git a/compiler/tflchef/tflite/src/Op/LogSoftmax.h b/compiler/tflchef/tflite/src/Op/include/LogSoftmax.h
index efd81f3e9..efd81f3e9 100644
--- a/compiler/tflchef/tflite/src/Op/LogSoftmax.h
+++ b/compiler/tflchef/tflite/src/Op/include/LogSoftmax.h
diff --git a/compiler/tflchef/tflite/src/Op/LogicalAnd.h b/compiler/tflchef/tflite/src/Op/include/LogicalAnd.h
index 1f7a964b9..1f7a964b9 100644
--- a/compiler/tflchef/tflite/src/Op/LogicalAnd.h
+++ b/compiler/tflchef/tflite/src/Op/include/LogicalAnd.h
diff --git a/compiler/tflchef/tflite/src/Op/LogicalNot.h b/compiler/tflchef/tflite/src/Op/include/LogicalNot.h
index b75d33554..b75d33554 100644
--- a/compiler/tflchef/tflite/src/Op/LogicalNot.h
+++ b/compiler/tflchef/tflite/src/Op/include/LogicalNot.h
diff --git a/compiler/tflchef/tflite/src/Op/LogicalOr.h b/compiler/tflchef/tflite/src/Op/include/LogicalOr.h
index 5331a0d65..5331a0d65 100644
--- a/compiler/tflchef/tflite/src/Op/LogicalOr.h
+++ b/compiler/tflchef/tflite/src/Op/include/LogicalOr.h
diff --git a/compiler/tflchef/tflite/src/Op/Logistic.h b/compiler/tflchef/tflite/src/Op/include/Logistic.h
index a75bf490e..a75bf490e 100644
--- a/compiler/tflchef/tflite/src/Op/Logistic.h
+++ b/compiler/tflchef/tflite/src/Op/include/Logistic.h
diff --git a/compiler/tflchef/tflite/src/Op/MatrixDiag.h b/compiler/tflchef/tflite/src/Op/include/MatrixDiag.h
index 4074f2c36..4074f2c36 100644
--- a/compiler/tflchef/tflite/src/Op/MatrixDiag.h
+++ b/compiler/tflchef/tflite/src/Op/include/MatrixDiag.h
diff --git a/compiler/tflchef/tflite/src/Op/MatrixSetDiag.h b/compiler/tflchef/tflite/src/Op/include/MatrixSetDiag.h
index 0e7ec7f32..0e7ec7f32 100644
--- a/compiler/tflchef/tflite/src/Op/MatrixSetDiag.h
+++ b/compiler/tflchef/tflite/src/Op/include/MatrixSetDiag.h
diff --git a/compiler/tflchef/tflite/src/Op/MaxPool2D.h b/compiler/tflchef/tflite/src/Op/include/MaxPool2D.h
index 36533f80c..36533f80c 100644
--- a/compiler/tflchef/tflite/src/Op/MaxPool2D.h
+++ b/compiler/tflchef/tflite/src/Op/include/MaxPool2D.h
diff --git a/compiler/tflchef/tflite/src/Op/Maximum.h b/compiler/tflchef/tflite/src/Op/include/Maximum.h
index acafec343..acafec343 100644
--- a/compiler/tflchef/tflite/src/Op/Maximum.h
+++ b/compiler/tflchef/tflite/src/Op/include/Maximum.h
diff --git a/compiler/tflchef/tflite/src/Op/Mean.h b/compiler/tflchef/tflite/src/Op/include/Mean.h
index 532c40c66..532c40c66 100644
--- a/compiler/tflchef/tflite/src/Op/Mean.h
+++ b/compiler/tflchef/tflite/src/Op/include/Mean.h
diff --git a/compiler/tflchef/tflite/src/Op/Minimum.h b/compiler/tflchef/tflite/src/Op/include/Minimum.h
index 5db5b7940..5db5b7940 100644
--- a/compiler/tflchef/tflite/src/Op/Minimum.h
+++ b/compiler/tflchef/tflite/src/Op/include/Minimum.h
diff --git a/compiler/tflchef/tflite/src/Op/MirrorPad.h b/compiler/tflchef/tflite/src/Op/include/MirrorPad.h
index c9acdd498..c9acdd498 100644
--- a/compiler/tflchef/tflite/src/Op/MirrorPad.h
+++ b/compiler/tflchef/tflite/src/Op/include/MirrorPad.h
diff --git a/compiler/tflchef/tflite/src/Op/Mul.h b/compiler/tflchef/tflite/src/Op/include/Mul.h
index fd009d2fd..fd009d2fd 100644
--- a/compiler/tflchef/tflite/src/Op/Mul.h
+++ b/compiler/tflchef/tflite/src/Op/include/Mul.h
diff --git a/compiler/tflchef/tflite/src/Op/Neg.h b/compiler/tflchef/tflite/src/Op/include/Neg.h
index c77ab7e84..c77ab7e84 100644
--- a/compiler/tflchef/tflite/src/Op/Neg.h
+++ b/compiler/tflchef/tflite/src/Op/include/Neg.h
diff --git a/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV4.h b/compiler/tflchef/tflite/src/Op/include/NonMaxSuppressionV4.h
index 114a2ad2f..114a2ad2f 100644
--- a/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV4.h
+++ b/compiler/tflchef/tflite/src/Op/include/NonMaxSuppressionV4.h
diff --git a/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV5.h b/compiler/tflchef/tflite/src/Op/include/NonMaxSuppressionV5.h
index c948043f4..c948043f4 100644
--- a/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV5.h
+++ b/compiler/tflchef/tflite/src/Op/include/NonMaxSuppressionV5.h
diff --git a/compiler/tflchef/tflite/src/Op/NotEqual.h b/compiler/tflchef/tflite/src/Op/include/NotEqual.h
index b1febdcc5..b1febdcc5 100644
--- a/compiler/tflchef/tflite/src/Op/NotEqual.h
+++ b/compiler/tflchef/tflite/src/Op/include/NotEqual.h
diff --git a/compiler/tflchef/tflite/src/Op/OneHot.h b/compiler/tflchef/tflite/src/Op/include/OneHot.h
index 50bbed095..50bbed095 100644
--- a/compiler/tflchef/tflite/src/Op/OneHot.h
+++ b/compiler/tflchef/tflite/src/Op/include/OneHot.h
diff --git a/compiler/tflchef/tflite/src/Op/PRelu.h b/compiler/tflchef/tflite/src/Op/include/PRelu.h
index b35c6e7ce..b35c6e7ce 100644
--- a/compiler/tflchef/tflite/src/Op/PRelu.h
+++ b/compiler/tflchef/tflite/src/Op/include/PRelu.h
diff --git a/compiler/tflchef/tflite/src/Op/Pack.h b/compiler/tflchef/tflite/src/Op/include/Pack.h
index 7779f64ed..7779f64ed 100644
--- a/compiler/tflchef/tflite/src/Op/Pack.h
+++ b/compiler/tflchef/tflite/src/Op/include/Pack.h
diff --git a/compiler/tflchef/tflite/src/Op/Pad.h b/compiler/tflchef/tflite/src/Op/include/Pad.h
index 99998d418..99998d418 100644
--- a/compiler/tflchef/tflite/src/Op/Pad.h
+++ b/compiler/tflchef/tflite/src/Op/include/Pad.h
diff --git a/compiler/tflchef/tflite/src/Op/PadV2.h b/compiler/tflchef/tflite/src/Op/include/PadV2.h
index 3aa474b92..3aa474b92 100644
--- a/compiler/tflchef/tflite/src/Op/PadV2.h
+++ b/compiler/tflchef/tflite/src/Op/include/PadV2.h
diff --git a/compiler/tflchef/tflite/src/Op/Pow.h b/compiler/tflchef/tflite/src/Op/include/Pow.h
index 20e847377..20e847377 100644
--- a/compiler/tflchef/tflite/src/Op/Pow.h
+++ b/compiler/tflchef/tflite/src/Op/include/Pow.h
diff --git a/compiler/tflchef/tflite/src/Op/Quantize.h b/compiler/tflchef/tflite/src/Op/include/Quantize.h
index 256ed5a5c..256ed5a5c 100644
--- a/compiler/tflchef/tflite/src/Op/Quantize.h
+++ b/compiler/tflchef/tflite/src/Op/include/Quantize.h
diff --git a/compiler/tflchef/tflite/src/Op/Range.h b/compiler/tflchef/tflite/src/Op/include/Range.h
index ad10dc58b..ad10dc58b 100644
--- a/compiler/tflchef/tflite/src/Op/Range.h
+++ b/compiler/tflchef/tflite/src/Op/include/Range.h
diff --git a/compiler/tflchef/tflite/src/Op/Rank.h b/compiler/tflchef/tflite/src/Op/include/Rank.h
index 003d9d310..003d9d310 100644
--- a/compiler/tflchef/tflite/src/Op/Rank.h
+++ b/compiler/tflchef/tflite/src/Op/include/Rank.h
diff --git a/compiler/tflchef/tflite/src/Op/ReLU.h b/compiler/tflchef/tflite/src/Op/include/ReLU.h
index be1090270..be1090270 100644
--- a/compiler/tflchef/tflite/src/Op/ReLU.h
+++ b/compiler/tflchef/tflite/src/Op/include/ReLU.h
diff --git a/compiler/tflchef/tflite/src/Op/ReLU6.h b/compiler/tflchef/tflite/src/Op/include/ReLU6.h
index 64ddb6a2e..64ddb6a2e 100644
--- a/compiler/tflchef/tflite/src/Op/ReLU6.h
+++ b/compiler/tflchef/tflite/src/Op/include/ReLU6.h
diff --git a/compiler/tflchef/tflite/src/Op/ReLUN1To1.h b/compiler/tflchef/tflite/src/Op/include/ReLUN1To1.h
index 0767006af..0767006af 100644
--- a/compiler/tflchef/tflite/src/Op/ReLUN1To1.h
+++ b/compiler/tflchef/tflite/src/Op/include/ReLUN1To1.h
diff --git a/compiler/tflchef/tflite/src/Op/ReduceAny.h b/compiler/tflchef/tflite/src/Op/include/ReduceAny.h
index dd5e361d5..dd5e361d5 100644
--- a/compiler/tflchef/tflite/src/Op/ReduceAny.h
+++ b/compiler/tflchef/tflite/src/Op/include/ReduceAny.h
diff --git a/compiler/tflchef/tflite/src/Op/ReduceMax.h b/compiler/tflchef/tflite/src/Op/include/ReduceMax.h
index 8e65cf47c..8e65cf47c 100644
--- a/compiler/tflchef/tflite/src/Op/ReduceMax.h
+++ b/compiler/tflchef/tflite/src/Op/include/ReduceMax.h
diff --git a/compiler/tflchef/tflite/src/Op/ReduceMin.h b/compiler/tflchef/tflite/src/Op/include/ReduceMin.h
index 88cba6fe7..88cba6fe7 100644
--- a/compiler/tflchef/tflite/src/Op/ReduceMin.h
+++ b/compiler/tflchef/tflite/src/Op/include/ReduceMin.h
diff --git a/compiler/tflchef/tflite/src/Op/ReduceProd.h b/compiler/tflchef/tflite/src/Op/include/ReduceProd.h
index e7766840a..e7766840a 100644
--- a/compiler/tflchef/tflite/src/Op/ReduceProd.h
+++ b/compiler/tflchef/tflite/src/Op/include/ReduceProd.h
diff --git a/compiler/tflchef/tflite/src/Op/Reshape.h b/compiler/tflchef/tflite/src/Op/include/Reshape.h
index be9fdac08..be9fdac08 100644
--- a/compiler/tflchef/tflite/src/Op/Reshape.h
+++ b/compiler/tflchef/tflite/src/Op/include/Reshape.h
diff --git a/compiler/tflchef/tflite/src/Op/ResizeBilinear.h b/compiler/tflchef/tflite/src/Op/include/ResizeBilinear.h
index 98c49c534..98c49c534 100644
--- a/compiler/tflchef/tflite/src/Op/ResizeBilinear.h
+++ b/compiler/tflchef/tflite/src/Op/include/ResizeBilinear.h
diff --git a/compiler/tflchef/tflite/src/Op/ResizeNearestNeighbor.h b/compiler/tflchef/tflite/src/Op/include/ResizeNearestNeighbor.h
index 5090bb938..5090bb938 100644
--- a/compiler/tflchef/tflite/src/Op/ResizeNearestNeighbor.h
+++ b/compiler/tflchef/tflite/src/Op/include/ResizeNearestNeighbor.h
diff --git a/compiler/tflchef/tflite/src/Op/ReverseSequence.h b/compiler/tflchef/tflite/src/Op/include/ReverseSequence.h
index 8c8c811e4..8c8c811e4 100644
--- a/compiler/tflchef/tflite/src/Op/ReverseSequence.h
+++ b/compiler/tflchef/tflite/src/Op/include/ReverseSequence.h
diff --git a/compiler/tflchef/tflite/src/Op/ReverseV2.h b/compiler/tflchef/tflite/src/Op/include/ReverseV2.h
index 6a8a75e6b..6a8a75e6b 100644
--- a/compiler/tflchef/tflite/src/Op/ReverseV2.h
+++ b/compiler/tflchef/tflite/src/Op/include/ReverseV2.h
diff --git a/compiler/tflchef/tflite/src/Op/Round.h b/compiler/tflchef/tflite/src/Op/include/Round.h
index df0da3fa1..df0da3fa1 100644
--- a/compiler/tflchef/tflite/src/Op/Round.h
+++ b/compiler/tflchef/tflite/src/Op/include/Round.h
diff --git a/compiler/tflchef/tflite/src/Op/Rsqrt.h b/compiler/tflchef/tflite/src/Op/include/Rsqrt.h
index 5d68344c2..5d68344c2 100644
--- a/compiler/tflchef/tflite/src/Op/Rsqrt.h
+++ b/compiler/tflchef/tflite/src/Op/include/Rsqrt.h
diff --git a/compiler/tflchef/tflite/src/Op/SVDF.h b/compiler/tflchef/tflite/src/Op/include/SVDF.h
index a59ca54a2..a59ca54a2 100644
--- a/compiler/tflchef/tflite/src/Op/SVDF.h
+++ b/compiler/tflchef/tflite/src/Op/include/SVDF.h
diff --git a/compiler/tflchef/tflite/src/Op/ScatterNd.h b/compiler/tflchef/tflite/src/Op/include/ScatterNd.h
index 76362d775..76362d775 100644
--- a/compiler/tflchef/tflite/src/Op/ScatterNd.h
+++ b/compiler/tflchef/tflite/src/Op/include/ScatterNd.h
diff --git a/compiler/tflchef/tflite/src/Op/SegmentSum.h b/compiler/tflchef/tflite/src/Op/include/SegmentSum.h
index d20e63bd7..d20e63bd7 100644
--- a/compiler/tflchef/tflite/src/Op/SegmentSum.h
+++ b/compiler/tflchef/tflite/src/Op/include/SegmentSum.h
diff --git a/compiler/tflchef/tflite/src/Op/Select.h b/compiler/tflchef/tflite/src/Op/include/Select.h
index bf8e57d78..bf8e57d78 100644
--- a/compiler/tflchef/tflite/src/Op/Select.h
+++ b/compiler/tflchef/tflite/src/Op/include/Select.h
diff --git a/compiler/tflchef/tflite/src/Op/SelectV2.h b/compiler/tflchef/tflite/src/Op/include/SelectV2.h
index ff03341d7..ff03341d7 100644
--- a/compiler/tflchef/tflite/src/Op/SelectV2.h
+++ b/compiler/tflchef/tflite/src/Op/include/SelectV2.h
diff --git a/compiler/tflchef/tflite/src/Op/Shape.h b/compiler/tflchef/tflite/src/Op/include/Shape.h
index ebe1befb3..ebe1befb3 100644
--- a/compiler/tflchef/tflite/src/Op/Shape.h
+++ b/compiler/tflchef/tflite/src/Op/include/Shape.h
diff --git a/compiler/tflchef/tflite/src/Op/Sin.h b/compiler/tflchef/tflite/src/Op/include/Sin.h
index 51eabceb5..51eabceb5 100644
--- a/compiler/tflchef/tflite/src/Op/Sin.h
+++ b/compiler/tflchef/tflite/src/Op/include/Sin.h
diff --git a/compiler/tflchef/tflite/src/Op/Slice.h b/compiler/tflchef/tflite/src/Op/include/Slice.h
index 6ca6724d3..6ca6724d3 100644
--- a/compiler/tflchef/tflite/src/Op/Slice.h
+++ b/compiler/tflchef/tflite/src/Op/include/Slice.h
diff --git a/compiler/tflchef/tflite/src/Op/Softmax.h b/compiler/tflchef/tflite/src/Op/include/Softmax.h
index cf168bdd9..cf168bdd9 100644
--- a/compiler/tflchef/tflite/src/Op/Softmax.h
+++ b/compiler/tflchef/tflite/src/Op/include/Softmax.h
diff --git a/compiler/tflchef/tflite/src/Op/SpaceToBatchND.h b/compiler/tflchef/tflite/src/Op/include/SpaceToBatchND.h
index 9d7bc44e8..9d7bc44e8 100644
--- a/compiler/tflchef/tflite/src/Op/SpaceToBatchND.h
+++ b/compiler/tflchef/tflite/src/Op/include/SpaceToBatchND.h
diff --git a/compiler/tflchef/tflite/src/Op/SpaceToDepth.h b/compiler/tflchef/tflite/src/Op/include/SpaceToDepth.h
index 784ad940a..784ad940a 100644
--- a/compiler/tflchef/tflite/src/Op/SpaceToDepth.h
+++ b/compiler/tflchef/tflite/src/Op/include/SpaceToDepth.h
diff --git a/compiler/tflchef/tflite/src/Op/SparseToDense.h b/compiler/tflchef/tflite/src/Op/include/SparseToDense.h
index 5ffe4789d..5ffe4789d 100644
--- a/compiler/tflchef/tflite/src/Op/SparseToDense.h
+++ b/compiler/tflchef/tflite/src/Op/include/SparseToDense.h
diff --git a/compiler/tflchef/tflite/src/Op/Split.h b/compiler/tflchef/tflite/src/Op/include/Split.h
index af247a1b9..af247a1b9 100644
--- a/compiler/tflchef/tflite/src/Op/Split.h
+++ b/compiler/tflchef/tflite/src/Op/include/Split.h
diff --git a/compiler/tflchef/tflite/src/Op/SplitV.h b/compiler/tflchef/tflite/src/Op/include/SplitV.h
index 3f715b5f9..3f715b5f9 100644
--- a/compiler/tflchef/tflite/src/Op/SplitV.h
+++ b/compiler/tflchef/tflite/src/Op/include/SplitV.h
diff --git a/compiler/tflchef/tflite/src/Op/Sqrt.h b/compiler/tflchef/tflite/src/Op/include/Sqrt.h
index 9f0ad04ae..9f0ad04ae 100644
--- a/compiler/tflchef/tflite/src/Op/Sqrt.h
+++ b/compiler/tflchef/tflite/src/Op/include/Sqrt.h
diff --git a/compiler/tflchef/tflite/src/Op/Square.h b/compiler/tflchef/tflite/src/Op/include/Square.h
index 9c008fe52..9c008fe52 100644
--- a/compiler/tflchef/tflite/src/Op/Square.h
+++ b/compiler/tflchef/tflite/src/Op/include/Square.h
diff --git a/compiler/tflchef/tflite/src/Op/SquaredDifference.h b/compiler/tflchef/tflite/src/Op/include/SquaredDifference.h
index 58c2ed460..58c2ed460 100644
--- a/compiler/tflchef/tflite/src/Op/SquaredDifference.h
+++ b/compiler/tflchef/tflite/src/Op/include/SquaredDifference.h
diff --git a/compiler/tflchef/tflite/src/Op/Squeeze.h b/compiler/tflchef/tflite/src/Op/include/Squeeze.h
index b6c89f73d..b6c89f73d 100644
--- a/compiler/tflchef/tflite/src/Op/Squeeze.h
+++ b/compiler/tflchef/tflite/src/Op/include/Squeeze.h
diff --git a/compiler/tflchef/tflite/src/Op/StridedSlice.h b/compiler/tflchef/tflite/src/Op/include/StridedSlice.h
index 98054b9b9..98054b9b9 100644
--- a/compiler/tflchef/tflite/src/Op/StridedSlice.h
+++ b/compiler/tflchef/tflite/src/Op/include/StridedSlice.h
diff --git a/compiler/tflchef/tflite/src/Op/Sub.h b/compiler/tflchef/tflite/src/Op/include/Sub.h
index 2168e5e0d..2168e5e0d 100644
--- a/compiler/tflchef/tflite/src/Op/Sub.h
+++ b/compiler/tflchef/tflite/src/Op/include/Sub.h
diff --git a/compiler/tflchef/tflite/src/Op/Sum.h b/compiler/tflchef/tflite/src/Op/include/Sum.h
index 38eeb080d..38eeb080d 100644
--- a/compiler/tflchef/tflite/src/Op/Sum.h
+++ b/compiler/tflchef/tflite/src/Op/include/Sum.h
diff --git a/compiler/tflchef/tflite/src/Op/Tanh.h b/compiler/tflchef/tflite/src/Op/include/Tanh.h
index 7339e4103..7339e4103 100644
--- a/compiler/tflchef/tflite/src/Op/Tanh.h
+++ b/compiler/tflchef/tflite/src/Op/include/Tanh.h
diff --git a/compiler/tflchef/tflite/src/Op/Tile.h b/compiler/tflchef/tflite/src/Op/include/Tile.h
index 640f52a1f..640f52a1f 100644
--- a/compiler/tflchef/tflite/src/Op/Tile.h
+++ b/compiler/tflchef/tflite/src/Op/include/Tile.h
diff --git a/compiler/tflchef/tflite/src/Op/TopKV2.h b/compiler/tflchef/tflite/src/Op/include/TopKV2.h
index b2b74cc75..b2b74cc75 100644
--- a/compiler/tflchef/tflite/src/Op/TopKV2.h
+++ b/compiler/tflchef/tflite/src/Op/include/TopKV2.h
diff --git a/compiler/tflchef/tflite/src/Op/Transpose.h b/compiler/tflchef/tflite/src/Op/include/Transpose.h
index f0d944b6b..f0d944b6b 100644
--- a/compiler/tflchef/tflite/src/Op/Transpose.h
+++ b/compiler/tflchef/tflite/src/Op/include/Transpose.h
diff --git a/compiler/tflchef/tflite/src/Op/TransposeConv.h b/compiler/tflchef/tflite/src/Op/include/TransposeConv.h
index c79cdabd2..c79cdabd2 100644
--- a/compiler/tflchef/tflite/src/Op/TransposeConv.h
+++ b/compiler/tflchef/tflite/src/Op/include/TransposeConv.h
diff --git a/compiler/tflchef/tflite/src/Op/UnidirectionalSequenceLSTM.h b/compiler/tflchef/tflite/src/Op/include/UnidirectionalSequenceLSTM.h
index cc4e5fb0f..cc4e5fb0f 100644
--- a/compiler/tflchef/tflite/src/Op/UnidirectionalSequenceLSTM.h
+++ b/compiler/tflchef/tflite/src/Op/include/UnidirectionalSequenceLSTM.h
diff --git a/compiler/tflchef/tflite/src/Op/Unique.h b/compiler/tflchef/tflite/src/Op/include/Unique.h
index fae037c9f..fae037c9f 100644
--- a/compiler/tflchef/tflite/src/Op/Unique.h
+++ b/compiler/tflchef/tflite/src/Op/include/Unique.h
diff --git a/compiler/tflchef/tflite/src/Op/Unpack.h b/compiler/tflchef/tflite/src/Op/include/Unpack.h
index 1036bdc14..1036bdc14 100644
--- a/compiler/tflchef/tflite/src/Op/Unpack.h
+++ b/compiler/tflchef/tflite/src/Op/include/Unpack.h
diff --git a/compiler/tflchef/tflite/src/Op/Where.h b/compiler/tflchef/tflite/src/Op/include/Where.h
index 00cdc4b00..00cdc4b00 100644
--- a/compiler/tflchef/tflite/src/Op/Where.h
+++ b/compiler/tflchef/tflite/src/Op/include/Where.h
diff --git a/compiler/tflchef/tflite/src/Op/ZerosLike.h b/compiler/tflchef/tflite/src/Op/include/ZerosLike.h
index 163c1fa21..163c1fa21 100644
--- a/compiler/tflchef/tflite/src/Op/ZerosLike.h
+++ b/compiler/tflchef/tflite/src/Op/include/ZerosLike.h
diff --git a/compiler/tflchef/tflite/src/TFliteOpChefs.h b/compiler/tflchef/tflite/src/TFliteOpChefs.h
index b38b35a61..1b9d420e5 100644
--- a/compiler/tflchef/tflite/src/TFliteOpChefs.h
+++ b/compiler/tflchef/tflite/src/TFliteOpChefs.h
@@ -18,115 +18,115 @@
 #define __TFLITE_OP_CHEFS_H__
 
 // In alphabet order
-#include "Op/Abs.h"
-#include "Op/Add.h"
-#include "Op/AddN.h"
-#include "Op/ArgMax.h"
-#include "Op/ArgMin.h"
-#include "Op/AveragePool2D.h"
-#include "Op/BatchMatMul.h"
-#include "Op/BatchToSpaceND.h"
-#include "Op/BidirectionalSequenceLSTM.h"
-#include "Op/Cast.h"
-#include "Op/Ceil.h"
-#include "Op/Concatenation.h"
-#include "Op/Conv2D.h"
-#include "Op/Cos.h"
-#include "Op/DepthToSpace.h"
-#include "Op/DepthwiseConv2D.h"
-#include "Op/Dequantize.h"
-#include "Op/Div.h"
-#include "Op/ELU.h"
-#include "Op/Equal.h"
-#include "Op/Exp.h"
-#include "Op/ExpandDims.h"
-#include "Op/FakeQuant.h"
-#include "Op/Fill.h"
-#include "Op/Floor.h"
-#include "Op/FloorDiv.h"
-#include "Op/FloorMod.h"
-#include "Op/FullyConnected.h"
-#include "Op/Gather.h"
-#include "Op/GatherNd.h"
-#include "Op/Greater.h"
-#include "Op/GreaterEqual.h"
-#include "Op/L2Normalize.h"
-#include "Op/L2Pool2D.h"
-#include "Op/LeakyRelu.h"
-#include "Op/Less.h"
-#include "Op/LessEqual.h"
-#include "Op/LocalResponseNormalization.h"
-#include "Op/Log.h"
-#include "Op/LogicalAnd.h"
-#include "Op/LogicalNot.h"
-#include "Op/LogicalOr.h"
-#include "Op/Logistic.h"
-#include "Op/LogSoftmax.h"
-#include "Op/MatrixDiag.h"
-#include "Op/MatrixSetDiag.h"
-#include "Op/Maximum.h"
-#include "Op/MaxPool2D.h"
-#include "Op/Mean.h"
-#include "Op/Minimum.h"
-#include "Op/MirrorPad.h"
-#include "Op/Mul.h"
-#include "Op/Neg.h"
-#include "Op/NonMaxSuppressionV4.h"
-#include "Op/NonMaxSuppressionV5.h"
-#include "Op/NotEqual.h"
-#include "Op/OneHot.h"
-#include "Op/Pack.h"
-#include "Op/Pad.h"
-#include "Op/PadV2.h"
-#include "Op/Pow.h"
-#include "Op/PRelu.h"
-#include "Op/Quantize.h"
-#include "Op/Range.h"
-#include "Op/Rank.h"
-#include "Op/ReduceAny.h"
-#include "Op/ReduceMax.h"
-#include "Op/ReduceMin.h"
-#include "Op/ReduceProd.h"
-#include "Op/ReLU.h"
-#include "Op/ReLU6.h"
-#include "Op/ReLUN1To1.h"
-#include "Op/Reshape.h"
-#include "Op/ResizeBilinear.h"
-#include "Op/ResizeNearestNeighbor.h"
-#include "Op/ReverseSequence.h"
-#include "Op/ReverseV2.h"
-#include "Op/Round.h"
-#include "Op/Rsqrt.h"
-#include "Op/ScatterNd.h"
-#include "Op/SegmentSum.h"
-#include "Op/Select.h"
-#include "Op/SelectV2.h"
-#include "Op/Shape.h"
-#include "Op/Sin.h"
-#include "Op/Slice.h"
-#include "Op/Softmax.h"
-#include "Op/SpaceToBatchND.h"
-#include "Op/SpaceToDepth.h"
-#include "Op/SparseToDense.h"
-#include "Op/Split.h"
-#include "Op/SplitV.h"
-#include "Op/Sqrt.h"
-#include "Op/Square.h"
-#include "Op/SquaredDifference.h"
-#include "Op/Squeeze.h"
-#include "Op/StridedSlice.h"
-#include "Op/Sub.h"
-#include "Op/Sum.h"
-#include "Op/SVDF.h"
-#include "Op/Tanh.h"
-#include "Op/Tile.h"
-#include "Op/TopKV2.h"
-#include "Op/Transpose.h"
-#include "Op/TransposeConv.h"
-#include "Op/UnidirectionalSequenceLSTM.h"
-#include "Op/Unique.h"
-#include "Op/Unpack.h"
-#include "Op/Where.h"
-#include "Op/ZerosLike.h"
+#include "Op/include/Abs.h"
+#include "Op/include/Add.h"
+#include "Op/include/AddN.h"
+#include "Op/include/ArgMax.h"
+#include "Op/include/ArgMin.h"
+#include "Op/include/AveragePool2D.h"
+#include "Op/include/BatchMatMul.h"
+#include "Op/include/BatchToSpaceND.h"
+#include "Op/include/BidirectionalSequenceLSTM.h"
+#include "Op/include/Cast.h"
+#include "Op/include/Ceil.h"
+#include "Op/include/Concatenation.h"
+#include "Op/include/Conv2D.h"
+#include "Op/include/Cos.h"
+#include "Op/include/DepthToSpace.h"
+#include "Op/include/DepthwiseConv2D.h"
+#include "Op/include/Dequantize.h"
+#include "Op/include/Div.h"
+#include "Op/include/ELU.h"
+#include "Op/include/Equal.h"
+#include "Op/include/Exp.h"
+#include "Op/include/ExpandDims.h"
+#include "Op/include/FakeQuant.h"
+#include "Op/include/Fill.h"
+#include "Op/include/Floor.h"
+#include "Op/include/FloorDiv.h"
+#include "Op/include/FloorMod.h"
+#include "Op/include/FullyConnected.h"
+#include "Op/include/Gather.h"
+#include "Op/include/GatherNd.h"
+#include "Op/include/Greater.h"
+#include "Op/include/GreaterEqual.h"
+#include "Op/include/L2Normalize.h"
+#include "Op/include/L2Pool2D.h"
+#include "Op/include/LeakyRelu.h"
+#include "Op/include/Less.h"
+#include "Op/include/LessEqual.h"
+#include "Op/include/LocalResponseNormalization.h"
+#include "Op/include/Log.h"
+#include "Op/include/LogicalAnd.h"
+#include "Op/include/LogicalNot.h"
+#include "Op/include/LogicalOr.h"
+#include "Op/include/Logistic.h"
+#include "Op/include/LogSoftmax.h"
+#include "Op/include/MatrixDiag.h"
+#include "Op/include/MatrixSetDiag.h"
+#include "Op/include/Maximum.h"
+#include "Op/include/MaxPool2D.h"
+#include "Op/include/Mean.h"
+#include "Op/include/Minimum.h"
+#include "Op/include/MirrorPad.h"
+#include "Op/include/Mul.h"
+#include "Op/include/Neg.h"
+#include "Op/include/NonMaxSuppressionV4.h"
+#include "Op/include/NonMaxSuppressionV5.h"
+#include "Op/include/NotEqual.h"
+#include "Op/include/OneHot.h"
+#include "Op/include/Pack.h"
+#include "Op/include/Pad.h"
+#include "Op/include/PadV2.h"
+#include "Op/include/Pow.h"
+#include "Op/include/PRelu.h"
+#include "Op/include/Quantize.h"
+#include "Op/include/Range.h"
+#include "Op/include/Rank.h"
+#include "Op/include/ReduceAny.h"
+#include "Op/include/ReduceMax.h"
+#include "Op/include/ReduceMin.h"
+#include "Op/include/ReduceProd.h"
+#include "Op/include/ReLU.h"
+#include "Op/include/ReLU6.h"
+#include "Op/include/ReLUN1To1.h"
+#include "Op/include/Reshape.h"
+#include "Op/include/ResizeBilinear.h"
+#include "Op/include/ResizeNearestNeighbor.h"
+#include "Op/include/ReverseSequence.h"
+#include "Op/include/ReverseV2.h"
+#include "Op/include/Round.h"
+#include "Op/include/Rsqrt.h"
+#include "Op/include/ScatterNd.h"
+#include "Op/include/SegmentSum.h"
+#include "Op/include/Select.h"
+#include "Op/include/SelectV2.h"
+#include "Op/include/Shape.h"
+#include "Op/include/Sin.h"
+#include "Op/include/Slice.h"
+#include "Op/include/Softmax.h"
+#include "Op/include/SpaceToBatchND.h"
+#include "Op/include/SpaceToDepth.h"
+#include "Op/include/SparseToDense.h"
+#include "Op/include/Split.h"
+#include "Op/include/SplitV.h"
+#include "Op/include/Sqrt.h"
+#include "Op/include/Square.h"
+#include "Op/include/SquaredDifference.h"
+#include "Op/include/Squeeze.h"
+#include "Op/include/StridedSlice.h"
+#include "Op/include/Sub.h"
+#include "Op/include/Sum.h"
+#include "Op/include/SVDF.h"
+#include "Op/include/Tanh.h"
+#include "Op/include/Tile.h"
+#include "Op/include/TopKV2.h"
+#include "Op/include/Transpose.h"
+#include "Op/include/TransposeConv.h"
+#include "Op/include/UnidirectionalSequenceLSTM.h"
+#include "Op/include/Unique.h"
+#include "Op/include/Unpack.h"
+#include "Op/include/Where.h"
+#include "Op/include/ZerosLike.h"
 
 #endif // __TFLITE_OP_CHEFS_H__
diff --git a/compiler/tflchef/tools/file/Driver.cpp b/compiler/tflchef/tools/file/Driver.cpp
index d4605ced3..f6c6789bd 100644
--- a/compiler/tflchef/tools/file/Driver.cpp
+++ b/compiler/tflchef/tools/file/Driver.cpp
@@ -28,10 +28,8 @@
 int entry(int argc, char **argv)
 {
   arser::Arser arser;
-  arser.add_argument("recipe")
-    .type(arser::DataType::STR)
-    .help("Source recipe file path to convert");
-  arser.add_argument("tflite").type(arser::DataType::STR).help("Target tflite file path");
+  arser.add_argument("recipe").help("Source recipe file path to convert");
+  arser.add_argument("tflite").help("Target tflite file path");
 
   try
   {
diff --git a/compiler/tflchef/tools/reverse/Driver.cpp b/compiler/tflchef/tools/reverse/Driver.cpp
index 1451e8bb8..119bee6be 100644
--- a/compiler/tflchef/tools/reverse/Driver.cpp
+++ b/compiler/tflchef/tools/reverse/Driver.cpp
@@ -25,10 +25,8 @@
 int entry(int argc, char **argv)
 {
   arser::Arser arser;
-  arser.add_argument("tflite")
-    .type(arser::DataType::STR)
-    .help("Source tflite file path to convert");
-  arser.add_argument("recipe").type(arser::DataType::STR).help("Target recipe file path");
+  arser.add_argument("tflite").help("Source tflite file path to convert");
+  arser.add_argument("recipe").help("Target recipe file path");
 
   try
   {
diff --git a/compiler/tfldump/CMakeLists.txt b/compiler/tfldump/CMakeLists.txt
index fac0be6bf..410232645 100644
--- a/compiler/tfldump/CMakeLists.txt
+++ b/compiler/tfldump/CMakeLists.txt
@@ -10,6 +10,7 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
 add_executable(tfldump ${DRIVER} ${SOURCES})
 target_include_directories(tfldump PRIVATE include)
 target_link_libraries(tfldump arser)
+target_link_libraries(tfldump foder)
 target_link_libraries(tfldump mio_tflite280)
 target_link_libraries(tfldump mio_tflite280_helper)
 target_link_libraries(tfldump safemain)
diff --git a/compiler/tfldump/driver/Driver.cpp b/compiler/tfldump/driver/Driver.cpp
index 38c9c062f..a3e748be1 100644
--- a/compiler/tfldump/driver/Driver.cpp
+++ b/compiler/tfldump/driver/Driver.cpp
@@ -15,7 +15,7 @@
  */
 
 #include <arser/arser.h>
-#include <tflread/Model.h>
+#include <foder/FileLoader.h>
 #include <tfldump/Dump.h>
 
 #include <iostream>
@@ -23,7 +23,7 @@
 int entry(int argc, char **argv)
 {
   arser::Arser arser;
-  arser.add_argument("tflite").type(arser::DataType::STR).help("TFLite file to dump");
+  arser.add_argument("tflite").help("TFLite file to dump");
 
   try
   {
@@ -38,14 +38,9 @@ int entry(int argc, char **argv)
 
   std::string tflite_path = arser.get<std::string>("tflite");
   // Load TF lite model from a tflite file
-  std::unique_ptr<tflread::Model> model = tflread::load_tflite(tflite_path);
-  if (model == nullptr)
-  {
-    std::cerr << "ERROR: Failed to load tflite '" << tflite_path << "'" << std::endl;
-    return 255;
-  }
-
-  const tflite::Model *tflmodel = model->model();
+  foder::FileLoader fileLoader{tflite_path};
+  std::vector<char> modelData = fileLoader.load();
+  const tflite::Model *tflmodel = tflite::GetModel(modelData.data());
   if (tflmodel == nullptr)
   {
     std::cerr << "ERROR: Failed to load tflite '" << tflite_path << "'" << std::endl;
diff --git a/compiler/tfldump/requires.cmake b/compiler/tfldump/requires.cmake
index b1abf9486..a11f6b200 100644
--- a/compiler/tfldump/requires.cmake
+++ b/compiler/tfldump/requires.cmake
@@ -1,3 +1,4 @@
 require("arser")
+require("foder")
 require("mio-tflite280")
 require("safemain")
diff --git a/compiler/tfldump/src/Dump.cpp b/compiler/tfldump/src/Dump.cpp
index 2a87e47d7..4388fcde8 100644
--- a/compiler/tfldump/src/Dump.cpp
+++ b/compiler/tfldump/src/Dump.cpp
@@ -33,7 +33,7 @@ void dump_buffer(std::ostream &os, const uint8_t *buffer, size_t size, size_t am
   std::ios_base::fmtflags saveflags(os.flags());
 
   bool second = false;
-  bool ellipsis = amount > 0 && size > 4;
+  bool ellipsis = amount > 0 && size > 8;
   size_t count = ellipsis ? std::min(size, amount) : size;
 
   for (size_t i = 0; i < count; i++)
@@ -103,8 +103,8 @@ std::ostream &operator<<(std::ostream &os, const flatbuffers::Vector<T> *fbvect)
   if (fbvect == nullptr)
     return os;
 
-  bool ellipsis = (fbvect->size() > 4);
-  auto limit_size = ellipsis ? 4 : fbvect->size();
+  bool ellipsis = (fbvect->size() > 8);
+  auto limit_size = ellipsis ? 8 : fbvect->size();
 
   if (ellipsis)
   {
diff --git a/compiler/tfldump/src/Load.cpp b/compiler/tfldump/src/Load.cpp
deleted file mode 100644
index d2f6e06f1..000000000
--- a/compiler/tfldump/src/Load.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <tflread/Model.h>
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-
-namespace
-{
-
-class MemoryMappedModel final : public tflread::Model
-{
-public:
-  /**
-   * @require fd and data SHOULD be valid
-   */
-  explicit MemoryMappedModel(int fd, void *data, size_t size) : _fd{fd}, _data{data}, _size{size}
-  {
-    // DO NOTHING
-  }
-
-public:
-  ~MemoryMappedModel()
-  {
-    munmap(_data, _size);
-    close(_fd);
-  }
-
-public:
-  MemoryMappedModel(const MemoryMappedModel &) = delete;
-  MemoryMappedModel(MemoryMappedModel &&) = delete;
-
-public:
-  const ::tflite::Model *model(void) const override { return ::tflite::GetModel(_data); }
-
-private:
-  int _fd = -1;
-  void *_data = nullptr;
-  size_t _size = 0;
-};
-
-class FileDescriptor final
-{
-public:
-  FileDescriptor(int value) : _value{value}
-  {
-    // DO NOTHING
-  }
-
-public:
-  // NOTE Copy is not allowed
-  FileDescriptor(const FileDescriptor &) = delete;
-
-public:
-  // NOTE Move is allowed
-  FileDescriptor(FileDescriptor &&fd) { _value = fd.release(); }
-
-public:
-  ~FileDescriptor()
-  {
-    if (_value != -1)
-    {
-      // Close on destructor
-      close(_value);
-    }
-  }
-
-public:
-  int value(void) const { return _value; }
-
-public:
-  int release(void)
-  {
-    auto res = _value;
-    _value = -1;
-    return res;
-  }
-
-private:
-  int _value = -1;
-};
-
-} // namespace
-
-namespace tflread
-{
-
-std::unique_ptr<Model> load_tflite(const std::string &path)
-{
-  FileDescriptor fd = open(path.c_str(), O_RDONLY);
-
-  if (fd.value() == -1)
-  {
-    // Return nullptr on open failure
-    return nullptr;
-  }
-
-  struct stat st;
-  if (fstat(fd.value(), &st) == -1)
-  {
-    // Return nullptr on fstat failure
-    return nullptr;
-  }
-
-  auto size = st.st_size;
-  auto data = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd.value(), 0);
-
-  if (data == MAP_FAILED)
-  {
-    // Return nullptr on mmap failure
-    return nullptr;
-  }
-
-  return std::unique_ptr<tflread::Model>{new MemoryMappedModel(fd.release(), data, size)};
-}
-
-} // namespace tflread
diff --git a/compiler/tfldump/src/OpPrinter.cpp b/compiler/tfldump/src/OpPrinter.cpp
index 47edcb086..2e8e7134f 100644
--- a/compiler/tfldump/src/OpPrinter.cpp
+++ b/compiler/tfldump/src/OpPrinter.cpp
@@ -736,6 +736,7 @@ OpPrinterRegistry::OpPrinterRegistry()
   // There is no Option for CEIL
   _op_map[tflite::BuiltinOperator_CONCATENATION] = make_unique<ConcatenationPrinter>();
   _op_map[tflite::BuiltinOperator_CONV_2D] = make_unique<Conv2DPrinter>();
+  // There is no Option for DENSIFY
   _op_map[tflite::BuiltinOperator_DEPTH_TO_SPACE] = make_unique<DepthToSpacePrinter>();
   _op_map[tflite::BuiltinOperator_DEPTHWISE_CONV_2D] = make_unique<DepthwiseConv2DPrinter>();
   // There is no Option for DEQUANTIZE
diff --git a/compiler/tflite2circle-conversion-test/CMakeLists.txt b/compiler/tflite2circle-conversion-test/CMakeLists.txt
index 83fe23a8f..2e67d48bd 100644
--- a/compiler/tflite2circle-conversion-test/CMakeLists.txt
+++ b/compiler/tflite2circle-conversion-test/CMakeLists.txt
@@ -1,3 +1,7 @@
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
 nnas_include(TargetRequire)
 
 unset(REQUIRED_TARGETS)
diff --git a/compiler/tflite2circle/driver/Driver.cpp b/compiler/tflite2circle/driver/Driver.cpp
index fb8c211b6..6afe1b0f2 100644
--- a/compiler/tflite2circle/driver/Driver.cpp
+++ b/compiler/tflite2circle/driver/Driver.cpp
@@ -36,24 +36,11 @@ int entry(int argc, char **argv)
 {
   arser::Arser arser{"tflite2circle is a Tensorflow lite to circle model converter"};
 
-  arser.add_argument("--version")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("Show version information and exit")
-    .exit_with(print_version);
-
-  arser.add_argument("-V", "--verbose")
-    .nargs(0)
-    .required(false)
-    .default_value(false)
-    .help("output additional information to stdout or stderr");
-
-  arser.add_argument("tflite")
-    .nargs(1)
-    .type(arser::DataType::STR)
-    .help("Source tflite file path to convert");
-  arser.add_argument("circle").nargs(1).type(arser::DataType::STR).help("Target circle file path");
+  arser::Helper::add_version(arser, print_version);
+  arser::Helper::add_verbose(arser);
+
+  arser.add_argument("tflite").help("Source tflite file path to convert");
+  arser.add_argument("circle").help("Target circle file path");
 
   try
   {
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions.h b/compiler/tflite2circle/src/BuildBuiltinOptions.h
index 88a4f71df..8149197f6 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions.h
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions.h
@@ -31,8 +31,10 @@
 #include "BuildBuiltinOptions/ConcatenationOptions.h"
 #include "BuildBuiltinOptions/Conv2DOptions.h"
 #include "BuildBuiltinOptions/CosOptions.h"
+#include "BuildBuiltinOptions/DensifyOptions.h"
 #include "BuildBuiltinOptions/DepthToSpaceOptions.h"
 #include "BuildBuiltinOptions/DepthwiseConv2DOptions.h"
+#include "BuildBuiltinOptions/DequantizeOptions.h"
 #include "BuildBuiltinOptions/DivOptions.h"
 #include "BuildBuiltinOptions/EqualOptions.h"
 #include "BuildBuiltinOptions/ExpandDimsOptions.h"
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/DensifyOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/DensifyOptions.cpp
new file mode 100644
index 000000000..4e5863576
--- /dev/null
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/DensifyOptions.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DensifyOptions.h"
+
+namespace tflite2circle
+{
+
+flatbuffers::Offset<circle::DensifyOptions>
+build_circle_DensifyOptions(flatbuffers::FlatBufferBuilder &fb, const tflite::Operator *)
+{
+  // Emit a DensifyOptions table with no fields set; the tflite operator is not consulted
+  return circle::DensifyOptionsBuilder{fb}.Finish();
+}
+
+} // namespace tflite2circle
diff --git a/compiler/tfldump/include/tflread/Model.h b/compiler/tflite2circle/src/BuildBuiltinOptions/DensifyOptions.h
index c6e4a94ac..b6126c4e2 100644
--- a/compiler/tfldump/include/tflread/Model.h
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/DensifyOptions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,30 +14,18 @@
  * limitations under the License.
  */
 
-#ifndef __TFLREAD_MODEL_H__
-#define __TFLREAD_MODEL_H__
+#ifndef __BBO_DENSIFY_OPTIONS_H__
+#define __BBO_DENSIFY_OPTIONS_H__
 
 #include <mio/tflite/schema_generated.h>
+#include <mio/circle/schema_generated.h>
 
-#include <memory>
-
-namespace tflread
-{
-
-struct Model
+namespace tflite2circle
 {
-  virtual ~Model() = default;
 
-  virtual const ::tflite::Model *model(void) const = 0;
-};
-
-/**
- * @brief Load TensorFlow Lite model (as a raw Model) from a given path
- *
- * @note May return a nullptr
- */
-std::unique_ptr<Model> load_tflite(const std::string &path);
+flatbuffers::Offset<circle::DensifyOptions>
+build_circle_DensifyOptions(flatbuffers::FlatBufferBuilder &fb, const tflite::Operator *op);
 
-} // namespace tflread
+} // namespace tflite2circle
 
-#endif // __TFLREAD_MODEL_H__
+#endif // __BBO_DENSIFY_OPTIONS_H__
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/DequantizeOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/DequantizeOptions.cpp
new file mode 100644
index 000000000..eeacece6a
--- /dev/null
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/DequantizeOptions.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DequantizeOptions.h"
+#include "DataLookup.h"
+
+namespace tflite2circle
+{
+
+flatbuffers::Offset<circle::DequantizeOptions>
+build_circle_DequantizeOptions(flatbuffers::FlatBufferBuilder &fb, const tflite::Operator *)
+{
+  // Nothing is read from the tflite operator, hence the unnamed (unused) parameter
+  return circle::DequantizeOptionsBuilder{fb}.Finish();
+}
+
+} // namespace tflite2circle
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/DequantizeOptions.h b/compiler/tflite2circle/src/BuildBuiltinOptions/DequantizeOptions.h
new file mode 100644
index 000000000..1cb9f9c1a
--- /dev/null
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/DequantizeOptions.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BBO_DEQUANTIZE_OPTIONS_H__
+#define __BBO_DEQUANTIZE_OPTIONS_H__
+
+#include <mio/tflite/schema_generated.h>
+#include <mio/circle/schema_generated.h>
+
+namespace tflite2circle
+{
+
+flatbuffers::Offset<circle::DequantizeOptions>
+build_circle_DequantizeOptions(flatbuffers::FlatBufferBuilder &fb, const tflite::Operator *op);
+
+} // namespace tflite2circle
+
+#endif // __BBO_DEQUANTIZE_OPTIONS_H__
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/MaximumMinimumOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/MaximumMinimumOptions.cpp
index d2d2888f2..db88d3e82 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/MaximumMinimumOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/MaximumMinimumOptions.cpp
@@ -25,8 +25,6 @@ namespace tflite2circle
 flatbuffers::Offset<circle::MaximumMinimumOptions>
 build_circle_MaximumMinimumOptions(flatbuffers::FlatBufferBuilder &fb, const tflite::Operator *op)
 {
-  auto tflite_builtin_options = op->builtin_options_as_MaximumMinimumOptions();
-  assert(tflite_builtin_options);
   circle::MaximumMinimumOptionsBuilder builtin_options_builder{fb};
   return builtin_options_builder.Finish();
 }
diff --git a/compiler/tflite2circle/src/CircleModel.cpp b/compiler/tflite2circle/src/CircleModel.cpp
index d483b288f..ac017b8f1 100644
--- a/compiler/tflite2circle/src/CircleModel.cpp
+++ b/compiler/tflite2circle/src/CircleModel.cpp
@@ -344,8 +344,13 @@ template <> void Offset<OperatorCodeLink>::build(const TFLFlatBufVec *tflite_fla
     circle::OperatorCodeBuilder operator_code_builder{*_fb};
     auto de_code = it->deprecated_builtin_code();
     auto bt_code = it->builtin_code();
-    operator_code_builder.add_deprecated_builtin_code(get_circle_builtin_code(de_code));
-    operator_code_builder.add_builtin_code(get_circle_builtin_code(bt_code));
+    auto cir_de_code = get_circle_builtin_code(de_code);
+    auto cir_bt_code = get_circle_builtin_code(bt_code);
+    // correct bt_code where bt_code == 0 for old tflite format
+    if (cir_bt_code == 0)
+      cir_bt_code = static_cast<circle::BuiltinOperator>(cir_de_code);
+    operator_code_builder.add_deprecated_builtin_code(cir_de_code);
+    operator_code_builder.add_builtin_code(cir_bt_code);
     operator_code_builder.add_custom_code(custom_code);
     operator_code_builder.add_version(it->version());
     auto code = operator_code_builder.Finish();
diff --git a/compiler/tflite2circle/src/TFLBuiltinOptions.lst b/compiler/tflite2circle/src/TFLBuiltinOptions.lst
index d55ba464a..9cbf8032a 100644
--- a/compiler/tflite2circle/src/TFLBuiltinOptions.lst
+++ b/compiler/tflite2circle/src/TFLBuiltinOptions.lst
@@ -42,7 +42,7 @@ TFL_BUILTIN_OPTIONS(TopKV2Options)
 TFL_BUILTIN_OPTIONS(SplitOptions)
 TFL_BUILTIN_OPTIONS(LogSoftmaxOptions)
 TFL_BUILTIN_OPTIONS(CastOptions)
-//TFL_BUILTIN_OPTIONS(DequantizeOptions)
+TFL_BUILTIN_OPTIONS(DequantizeOptions)
 TFL_BUILTIN_OPTIONS(MaximumMinimumOptions)
 TFL_BUILTIN_OPTIONS(ArgMaxOptions)
 TFL_BUILTIN_OPTIONS(LessOptions)
@@ -106,3 +106,4 @@ TFL_BUILTIN_OPTIONS(RankOptions)
 TFL_BUILTIN_OPTIONS(ScatterNdOptions)
 TFL_BUILTIN_OPTIONS(SegmentSumOptions)
 TFL_BUILTIN_OPTIONS(BatchMatMulOptions)
+TFL_BUILTIN_OPTIONS(DensifyOptions)
diff --git a/compiler/vconone/CMakeLists.txt b/compiler/vconone/CMakeLists.txt
index 3841a1b78..93c33cdbd 100644
--- a/compiler/vconone/CMakeLists.txt
+++ b/compiler/vconone/CMakeLists.txt
@@ -1,5 +1,5 @@
 if (NOT VCONONE_VERSION)
-  set(VCONONE_VERSION 0x0000000000140001)
+  set(VCONONE_VERSION 0x0000000000150001)
   # NOTE order is [build patch minor major]
   # if VCONONE_VERSION is set with -D option, it will be cached
   # you may have to remove cache file if you remove -D option
diff --git a/compiler/vconone/src/version.cpp b/compiler/vconone/src/version.cpp
index d94a7ada6..cebf7d998 100644
--- a/compiler/vconone/src/version.cpp
+++ b/compiler/vconone/src/version.cpp
@@ -54,7 +54,7 @@ std::string get_string(void)
 std::string get_copyright(void)
 {
   std::string str;
-  str = "Copyright (c) 2020-2021 Samsung Electronics Co., Ltd. All Rights Reserved\r\n";
+  str = "Copyright (c) 2020-2022 Samsung Electronics Co., Ltd. All Rights Reserved\r\n";
   str += "Licensed under the Apache License, Version 2.0\r\n";
   str += "https://github.com/Samsung/ONE";
   return str;
author	Chunseok Lee <chunseok.lee@samsung.com>	2022-09-07 19:04:21 +0900
committer	Chunseok Lee <chunseok.lee@samsung.com>	2022-09-07 19:04:21 +0900
commit	c690d52bdd137ed6a17353aa7af35e8141ece77b (patch)
tree	dbb7dd99133132dfbffcb8c9e9af4f1ffc2f4808 /compiler
parent	3ad689f0803519e343c36d5700646e86059df961 (diff)
download	nnfw-c690d52bdd137ed6a17353aa7af35e8141ece77b.tar.gz nnfw-c690d52bdd137ed6a17353aa7af35e8141ece77b.tar.bz2 nnfw-c690d52bdd137ed6a17353aa7af35e8141ece77b.zip