Imported Upstream version 1.18.0upstream/1.18.0 submit/tizen/20211028.014856 submit/tizen/20211019.023737 accepted/tizen/unified/20211101.140244

author: Chunseok Lee <chunseok.lee@samsung.com> 2021-10-19 11:32:46 +0900
committer: Chunseok Lee <chunseok.lee@samsung.com> 2021-10-19 11:32:46 +0900
commit: 33ae5d70a1ed85d215c1293ed63afbf3517b07d5 (patch)
tree: 9f1ace0f4760a8f7903ef15e2e92f1d1401e4b1e
parent: f4cf19e579a19c5346ccb2aad55bfd251065e447 (diff)
download: nnfw-33ae5d70a1ed85d215c1293ed63afbf3517b07d5.tar.gz
nnfw-33ae5d70a1ed85d215c1293ed63afbf3517b07d5.tar.bz2
nnfw-33ae5d70a1ed85d215c1293ed63afbf3517b07d5.zip
504 files changed, 17146 insertions, 3618 deletions
diff --git a/.ahub/tcchecker-tca/config.yaml b/.ahub/tcchecker-tca/config.yaml
index 9c0a8d881..86d272d8a 100644
--- a/.ahub/tcchecker-tca/config.yaml
+++ b/.ahub/tcchecker-tca/config.yaml
@@ -25,21 +25,6 @@ test:
         any: true
       - extension: cc
         any: true
-      - excludes :
-        - DepthwiseConv2D.cc
-        - ArgMinMax.cc
-        - AveragePool2D.cc
-        - Concat.cc
-        - DepthToSpace.cc
-        - DepthwiseConv2D.cc
-        - Fill.cc
-        - If.cc
-        - Pad.cc
-        - Reduce.cc
-        - ResizeBilinear.c
-        - Slice.cc
-        - Softmax.cc
-        - While.cc
     testCase:
       - condition:
         - functionName:
diff --git a/compiler/arser/include/arser/arser.h b/compiler/arser/include/arser/arser.h
index f2a7a2b85..1703e421e 100644
--- a/compiler/arser/include/arser/arser.h
+++ b/compiler/arser/include/arser/arser.h
@@ -238,6 +238,18 @@ public:
     return *this;
   }
 
+  Argument &accumulated(void)
+  {
+    _is_accumulated = true;
+    return *this;
+  }
+
+  Argument &accumulated(bool value)
+  {
+    _is_accumulated = value;
+    return *this;
+  }
+
   Argument &help(std::string help_message)
   {
     _help_message = help_message;
@@ -296,7 +308,9 @@ private:
   std::function<void(void)> _func;
   uint32_t _nargs{1};
   bool _is_required{false};
+  bool _is_accumulated{false};
   std::vector<std::string> _values;
+  std::vector<std::vector<std::string>> _accum_values;
 
   friend class Arser;
   friend std::ostream &operator<<(std::ostream &, const Arser &);
@@ -403,6 +417,8 @@ public:
         throw std::runtime_error("Invalid arguments. Positional argument must always be required.");
       }
     }
+    // TODO accumulated arguments shouldn't be enabled to positional arguments.
+    // TODO accumulated arguments shouldn't be enabled to optional arguments whose `narg` == 0.
   }
 
   void parse(int argc, char **argv)
@@ -475,6 +491,11 @@ public:
                                      "You must have missed some argument.");
           arg->second->_values.emplace_back(argv[c++]);
         }
+        // accumulate values
+        if (arg->second->_is_accumulated)
+        {
+          arg->second->_accum_values.emplace_back(arg->second->_values);
+        }
         if (arg->second->_nargs == 0)
         {
           // TODO std::boolalpha for true or false
@@ -493,6 +514,9 @@ public:
     if (arg == _arg_map.end())
       return false;
 
+    if (arg->second->_is_accumulated)
+      return arg->second->_accum_values.size() > 0 ? true : false;
+
     return arg->second->_values.size() > 0 ? true : false;
   }
 
@@ -500,6 +524,9 @@ public:
 
   template <typename T> std::vector<T> get_impl(const std::string &arg_name, std::vector<T> *);
 
+  template <typename T>
+  std::vector<std::vector<T>> get_impl(const std::string &arg_name, std::vector<std::vector<T>> *);
+
   template <typename T> T get(const std::string &arg_name);
 
   friend std::ostream &operator<<(std::ostream &stream, const Arser &parser)
@@ -617,6 +644,12 @@ template <typename T> T Arser::get_impl(const std::string &arg_name, T *)
                              "There is no argument you are looking for: " +
                              arg_name);
 
+  if (arg->second->_is_accumulated)
+    throw std::runtime_error(
+      "Type mismatch. "
+      "You called get using a type different from the one you specified."
+      "Accumulated argument is returned as std::vector of the specified type");
+
   if (arg->second->_type != TypeName<T>::Get())
     throw std::runtime_error("Type mismatch. "
                              "You called get() method with a type different "
@@ -640,6 +673,22 @@ template <typename T> std::vector<T> Arser::get_impl(const std::string &arg_name
                              "There is no argument you are looking for: " +
                              arg_name);
 
+  // Accumulated arguments with scalar type (e.g., STR)
+  if (arg->second->_is_accumulated)
+  {
+    if (arg->second->_type != TypeName<T>::Get())
+      throw std::runtime_error("Type mismatch. "
+                               "You called get using a type different from the one you specified.");
+
+    std::vector<T> data;
+    for (auto values : arg->second->_accum_values)
+    {
+      assert(values.size() == 1);
+      data.emplace_back(internal::lexical_cast<T>(values[0]));
+    }
+    return data;
+  }
+
   if (arg->second->_type != TypeName<std::vector<T>>::Get())
     throw std::runtime_error("Type mismatch. "
                              "You called get using a type different from the one you specified.");
@@ -650,6 +699,39 @@ template <typename T> std::vector<T> Arser::get_impl(const std::string &arg_name
   return data;
 }
 
+// Accumulated arguments with vector type (e.g., STR_VEC)
+template <typename T>
+std::vector<std::vector<T>> Arser::get_impl(const std::string &arg_name,
+                                            std::vector<std::vector<T>> *)
+{
+  auto arg = _arg_map.find(arg_name);
+  if (arg == _arg_map.end())
+    throw std::runtime_error("Invalid argument. "
+                             "There is no argument you are looking for: " +
+                             arg_name);
+
+  if (not arg->second->_is_accumulated)
+    throw std::runtime_error("Type mismatch. "
+                             "You called get using a type different from the one you specified.");
+
+  if (arg->second->_type != TypeName<std::vector<T>>::Get())
+    throw std::runtime_error(
+      "Type mismatch. "
+      "You called get using a type different from the one you specified."
+      "Accumulated argument is returned as std::vector of the specified type");
+
+  std::vector<std::vector<T>> result;
+  for (auto values : arg->second->_accum_values)
+  {
+    std::vector<T> data;
+    std::transform(values.begin(), values.end(), std::back_inserter(data),
+                   [](std::string str) -> T { return internal::lexical_cast<T>(str); });
+    result.emplace_back(data);
+  }
+
+  return result;
+}
+
 template <typename T> T Arser::get(const std::string &arg_name)
 {
   return get_impl(arg_name, static_cast<T *>(nullptr));
diff --git a/compiler/arser/tests/arser.test.cpp b/compiler/arser/tests/arser.test.cpp
index b37d0dec3..4e88f0cb7 100644
--- a/compiler/arser/tests/arser.test.cpp
+++ b/compiler/arser/tests/arser.test.cpp
@@ -93,7 +93,7 @@ TEST(BasicTest, OptionalArgument)
   EXPECT_THROW(arser.get<bool>("--volume"), std::runtime_error);
 }
 
-TEST(BasicTest, NonRequiredOptionalArgument)
+TEST(BasicTest, NonRequiredOptionalArgument_NEG)
 {
   /* arrange */
   Arser arser;
@@ -111,7 +111,7 @@ TEST(BasicTest, NonRequiredOptionalArgument)
   EXPECT_THROW(arser.get<int>("--weight"), std::runtime_error);
 }
 
-TEST(BasicTest, RequiredOptionalArgument)
+TEST(BasicTest, RequiredOptionalArgument_NEG)
 {
   /* arrange */
   Arser arser;
@@ -395,7 +395,7 @@ TEST(BasicTest, shortMultipleOption)
   EXPECT_EQ("I/am/out.put", arser.get<std::string>("--output_path"));
 }
 
-TEST(BasicTest, OptWithRequiredDuplicate)
+TEST(BasicTest, OptWithRequiredDuplicate_NEG)
 {
   /* arrange */
   Arser arser;
@@ -441,3 +441,61 @@ TEST(BasicTest, OptWithNonRequiredDuplicate)
   EXPECT_TRUE(arser["--output_path"]);
   EXPECT_EQ("I/am/out.put", arser.get<std::string>("--output_path"));
 }
+
+TEST(BasicTest, AccumulateVectorOptions)
+{
+  /* arrange */
+  Arser arser;
+
+  arser.add_argument("--specify").nargs(3).accumulated(true).type(arser::DataType::STR_VEC);
+
+  Prompt prompt("./driver --specify a b c --specify 1 2 3");
+  /* act */
+  arser.parse(prompt.argc(), prompt.argv());
+  /* assert */
+  EXPECT_TRUE(arser["--specify"]);
+
+  auto specify = arser.get<std::vector<std::vector<std::string>>>("--specify");
+  auto first = specify[0];
+  EXPECT_EQ("a", first.at(0));
+  EXPECT_EQ("b", first.at(1));
+  EXPECT_EQ("c", first.at(2));
+  auto second = specify[1];
+  EXPECT_EQ("1", second.at(0));
+  EXPECT_EQ("2", second.at(1));
+  EXPECT_EQ("3", second.at(2));
+}
+
+TEST(BasicTest, AccumulateScalarOptions)
+{
+  /* arrange */
+  Arser arser;
+
+  arser.add_argument("--specify").nargs(1).accumulated(true).type(arser::DataType::FLOAT);
+
+  Prompt prompt("./driver --specify 1 --specify 2");
+  /* act */
+  arser.parse(prompt.argc(), prompt.argv());
+  /* assert */
+  EXPECT_TRUE(arser["--specify"]);
+
+  auto specify = arser.get<std::vector<float>>("--specify");
+  EXPECT_EQ(1, specify.at(0));
+  EXPECT_EQ(2, specify.at(1));
+}
+
+TEST(BasicTest, AccumulateScalarOptions_WrongType_NEG)
+{
+  /* arrange */
+  Arser arser;
+
+  arser.add_argument("--specify").nargs(1).accumulated(true).type(arser::DataType::FLOAT);
+
+  Prompt prompt("./driver --specify 1 --specify 2");
+  /* act */
+  arser.parse(prompt.argc(), prompt.argv());
+  /* assert */
+  EXPECT_TRUE(arser["--specify"]);
+
+  EXPECT_THROW(arser.get<float>("--specify"), std::runtime_error);
+}
diff --git a/compiler/circle-opselector/CMakeLists.txt b/compiler/circle-opselector/CMakeLists.txt
new file mode 100644
index 000000000..93ab84c09
--- /dev/null
+++ b/compiler/circle-opselector/CMakeLists.txt
@@ -0,0 +1,36 @@
+set(DRIVER "driver/Driver.cpp")
+
+file(GLOB_RECURSE SOURCES "src/*.cpp")
+file(GLOB_RECURSE TESTS "src/*.test.cpp")
+list(REMOVE_ITEM SOURCES ${TESTS})
+
+add_executable(circle-opselector ${DRIVER} ${SOURCES})
+target_include_directories(circle-opselector PRIVATE src)
+target_link_libraries(circle-opselector foder)
+target_link_libraries(circle-opselector safemain)
+target_link_libraries(circle-opselector loco)
+target_link_libraries(circle-opselector luci_import)
+target_link_libraries(circle-opselector luci_export)
+target_link_libraries(circle-opselector arser)
+target_link_libraries(circle-opselector vconone)
+target_link_libraries(circle-opselector luci_service)
+target_link_libraries(circle-opselector luci_profile)
+
+install(TARGETS circle-opselector DESTINATION bin)
+
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
+GTest_AddTest(circle-opselector-test ${TESTS} ${SOURCES} ${DRIVER})
+target_include_directories(circle-opselector-test PRIVATE src)
+target_link_libraries(circle-opselector-test foder)
+target_link_libraries(circle-opselector-test loco)
+target_link_libraries(circle-opselector-test luci_import)
+target_link_libraries(circle-opselector-test luci_export)
+target_link_libraries(circle-opselector-test arser)
+target_link_libraries(circle-opselector-test vconone)
+target_link_libraries(circle-opselector-test luci_service)
+target_link_libraries(circle-opselector-test luci_profile)
diff --git a/compiler/circle-opselector/README.md b/compiler/circle-opselector/README.md
new file mode 100644
index 000000000..c06899ab5
--- /dev/null
+++ b/compiler/circle-opselector/README.md
@@ -0,0 +1,21 @@
+# circle-opselector
+
+`circle-opselector` is a tool for creating new circle models by selecting nodes from a model.
+
+## Example
+
+### 1. Select from location numbers
+
+```bash
+./circle-opselector --by_id "1-3,5" input.circle output.circle
+```
+
+Then, output.circle which has node 1, 2, 3 and 5 will be created.
+
+### 2. Select from node names
+
+```bash
+./circle-opselector --by_name "Add_1,Sub_1,Concat_2" input.circle output.circle
+```
+
+Then, output.circle which has node Add_1, Sub_1 and Concat_2 will be created.
diff --git a/compiler/circle-opselector/driver/Driver.cpp b/compiler/circle-opselector/driver/Driver.cpp
new file mode 100644
index 000000000..a1ace4f58
--- /dev/null
+++ b/compiler/circle-opselector/driver/Driver.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ModuleIO.h"
+
+#include <luci/Profile/CircleNodeID.h>
+
+#include <arser/arser.h>
+#include <vconone/vconone.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <cctype>
+#include <numeric>
+#include <sstream>
+
+void print_version(void)
+{
+  std::cout << "circle-opselector version " << vconone::get_string() << std::endl;
+  std::cout << vconone::get_copyright() << std::endl;
+}
+
+std::vector<std::string> split_into_vector(const std::string &str, const char &delim)
+{
+  std::vector<std::string> ret;
+  std::istringstream is(str);
+  for (std::string item; std::getline(is, item, delim);)
+  {
+    ret.push_back(item);
+  }
+
+  // remove empty string
+  ret.erase(std::remove_if(ret.begin(), ret.end(), [](const std::string &s) { return s.empty(); }),
+            ret.end());
+
+  return ret;
+}
+
+bool is_number(const std::string &s)
+{
+  return !s.empty() && std::find_if(s.begin(), s.end(),
+                                    [](unsigned char c) { return !std::isdigit(c); }) == s.end();
+}
+
+bool is_number(const std::vector<std::string> &vec)
+{
+  for (const auto &s : vec)
+  {
+    if (not::is_number(s))
+    {
+      return false;
+    }
+  }
+  return true;
+}
+
+/**
+ * @brief  Segmentation function for user's '--by_id' input
+ *
+ * @note   This function tokenizes the input data.s
+ *         First, divide it into ',', and if token has '-', devide it once more into '-'.
+ *         For example, if user input is '12,34,56', it is devided into [12,34,56].
+ *         If input is '1-2,34,56', it is devided into [[1,2],34,56].
+ *         And '-' means range so, if input is '2-7', it means all integer between 2-7.
+ */
+std::vector<uint32_t> split_id_input(const std::string &str)
+{
+  std::vector<uint32_t> by_id;
+
+  // tokenize colon-separated string
+  auto colon_tokens = ::split_into_vector(str, ',');
+  if (colon_tokens.empty()) // input empty line like "".
+  {
+    std::cerr << "ERROR: Nothing was entered." << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  for (const auto &ctok : colon_tokens)
+  {
+    auto dash_tokens = ::split_into_vector(ctok, '-');
+    if (not::is_number(dash_tokens))
+    {
+      std::cerr << "ERROR: To select operator by id, please use these args: [0-9], '-', ','"
+                << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    // convert string into integer
+    std::vector<uint32_t> int_tokens;
+    try
+    {
+      std::transform(dash_tokens.begin(), dash_tokens.end(), std::back_inserter(int_tokens),
+                     [](const std::string &str) { return static_cast<uint32_t>(std::stoi(str)); });
+    }
+    catch (const std::out_of_range &)
+    {
+      // if input is big integer like '123467891234', stoi throw this exception.
+      std::cerr << "ERROR: Argument is out of range." << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    catch (...)
+    {
+      std::cerr << "ERROR: Unknown error" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+
+    switch (int_tokens.size())
+    {
+      case 0: // inputs like "-"
+      {
+        std::cerr << "ERROR: Nothing was entered" << std::endl;
+        exit(EXIT_FAILURE);
+      }
+      case 1: // inputs like "1", "2"
+      {
+        by_id.push_back(int_tokens.at(0));
+        break;
+      }
+      case 2: // inputs like "1-2", "11-50"
+      {
+        for (uint32_t i = int_tokens.at(0); i <= int_tokens.at(1); i++)
+        {
+          by_id.push_back(i);
+        }
+        break;
+      }
+      default: // inputs like "1-2-3"
+      {
+        std::cerr << "ERROR: Too many '-' in str." << std::endl;
+        exit(EXIT_FAILURE);
+      }
+    }
+  }
+
+  return by_id;
+}
+
+std::vector<std::string> split_name_input(const std::string &str)
+{
+  return ::split_into_vector(str, ',');
+}
+
+int entry(int argc, char **argv)
+{
+  // TODO Add new option names!
+
+  arser::Arser arser("circle-opselector provides selecting operations in circle model");
+
+  arser.add_argument("--version")
+    .nargs(0)
+    .default_value(false)
+    .help("Show version information and exit")
+    .exit_with(print_version);
+
+  // TODO Add new options!
+
+  arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model");
+  arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model");
+
+  // select option
+  arser.add_argument("--by_id")
+    .nargs(1)
+    .type(arser::DataType::STR)
+    .help("Input operation id to select nodes.");
+  arser.add_argument("--by_name")
+    .nargs(1)
+    .type(arser::DataType::STR)
+    .help("Input operation name to select nodes.");
+
+  try
+  {
+    arser.parse(argc, argv);
+  }
+  catch (const std::runtime_error &err)
+  {
+    std::cerr << err.what() << std::endl;
+    std::cout << arser;
+    return EXIT_FAILURE;
+  }
+
+  std::string input_path = arser.get<std::string>("input");
+  std::string output_path = arser.get<std::string>("output");
+
+  std::string operator_input;
+
+  std::vector<uint32_t> by_id;
+  std::vector<std::string> by_name;
+
+  if (!arser["--by_id"] && !arser["--by_name"] || arser["--by_id"] && arser["--by_name"])
+  {
+    std::cerr << "ERROR: Either option '--by_id' or '--by_name' must be specified" << std::endl;
+    std::cerr << arser;
+    return EXIT_FAILURE;
+  }
+
+  if (arser["--by_id"])
+  {
+    operator_input = arser.get<std::string>("--by_id");
+    by_id = split_id_input(operator_input);
+  }
+  if (arser["--by_name"])
+  {
+    operator_input = arser.get<std::string>("--by_name");
+    by_name = split_name_input(operator_input);
+  }
+
+  // Import original circle file.
+  auto module = opselector::getModule(input_path);
+
+  // Select nodes from user input.
+  std::vector<const luci::CircleNode *> selected_nodes;
+
+  // put selected nodes into vector.
+  if (by_id.size())
+  {
+    loco::Graph *graph = module.get()->graph(0); // get main subgraph.
+
+    for (auto node : loco::all_nodes(graph))
+    {
+      auto cnode = loco::must_cast<const luci::CircleNode *>(node);
+
+      try
+      {
+        auto node_id = luci::get_node_id(cnode); // if the node is not operator, throw runtime_error
+
+        for (auto selected_id : by_id)
+          if (selected_id == node_id) // find the selected id
+            selected_nodes.emplace_back(cnode);
+      }
+      catch (std::runtime_error)
+      {
+        continue;
+      }
+    }
+  }
+  if (by_name.size())
+  {
+    loco::Graph *graph = module.get()->graph(0); // get main subgraph.
+
+    for (auto node : loco::all_nodes(graph))
+    {
+      auto cnode = loco::must_cast<const luci::CircleNode *>(node);
+      std::string node_name = cnode->name();
+
+      for (auto selected_name : by_name)
+        if (selected_name.compare(node_name) == 0) // find the selected name
+          selected_nodes.emplace_back(cnode);
+    }
+  }
+  if (selected_nodes.size() == 0)
+  {
+    std::cerr << "ERROR: No operator selected" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  // TODO implement node selections
+
+  // Export to output Circle file
+  assert(opselector::exportModule(module.get(), output_path));
+
+  return 0;
+}
diff --git a/compiler/circle-opselector/requires.cmake b/compiler/circle-opselector/requires.cmake
new file mode 100644
index 000000000..dcdbcbb68
--- /dev/null
+++ b/compiler/circle-opselector/requires.cmake
@@ -0,0 +1,6 @@
+require("foder")
+require("loco")
+require("safemain")
+require("luci")
+require("arser")
+require("vconone")
diff --git a/compiler/circle-opselector/src/Driver.test.cpp b/compiler/circle-opselector/src/Driver.test.cpp
new file mode 100644
index 000000000..6e569085e
--- /dev/null
+++ b/compiler/circle-opselector/src/Driver.test.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Driver.test.h"
+#include "TestHelper.h"
+
+#include <gtest/gtest.h>
+
+TEST(DriverTest, NoArg_NEG)
+{
+  Argv<1> argv;
+  argv.add("circle-opselector");
+
+  ::testing::internal::CaptureStderr();
+  ::testing::internal::CaptureStdout();
+  int result = entry(1, argv.argv());
+  ::testing::internal::GetCapturedStdout();
+  ASSERT_EQ(EXIT_FAILURE, result);
+}
+
+TEST(DriverTest, Wrong_ID_NEG)
+{
+  std::string str1 = "1";
+  std::string empty = "";
+  std::string no_integer = "1531538X5";
+
+  ASSERT_EQ(true, is_number(str1));
+  ASSERT_EQ(false, is_number(empty));
+  ASSERT_EQ(false, is_number(no_integer));
+}
+
+TEST(DriverTest, Split)
+{
+  std::vector<uint32_t> vec1;
+  std::vector<uint32_t> vec2;
+
+  std::string hyphen = "1-3,8-10";
+  std::string comma = "1,2,3";
+
+  vec1.push_back(1);
+  vec1.push_back(2);
+  vec1.push_back(3);
+  vec1.push_back(8);
+  vec1.push_back(9);
+  vec1.push_back(10);
+
+  vec2.push_back(1);
+  vec2.push_back(2);
+  vec2.push_back(3);
+
+  ASSERT_EQ(vec1, split_id_input(hyphen));
+  ASSERT_EQ(vec2, split_id_input(comma));
+}
diff --git a/compiler/circle-opselector/src/Driver.test.h b/compiler/circle-opselector/src/Driver.test.h
new file mode 100644
index 000000000..06f151649
--- /dev/null
+++ b/compiler/circle-opselector/src/Driver.test.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CIRCLE_OPSELECTOR_DRIVER_TEST_H__
+#define __CIRCLE_OPSELECTOR_DRIVER_TEST_H__
+
+#include <vector>
+#include <string>
+
+int entry(int argc, char **argv);
+bool is_number(const std::string &s);
+std::vector<uint32_t> split_id_input(const std::string &str);
+
+#endif // __CIRCLE_OPSELECTOR_DRIVER_TEST_H__
diff --git a/compiler/circle-opselector/src/ModuleIO.cpp b/compiler/circle-opselector/src/ModuleIO.cpp
new file mode 100644
index 000000000..46f45ceb0
--- /dev/null
+++ b/compiler/circle-opselector/src/ModuleIO.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ModuleIO.h"
+
+#include <foder/FileLoader.h>
+
+#include <luci/Importer.h>
+#include <luci/CircleExporter.h>
+#include <luci/CircleFileExpContract.h>
+
+#include <iostream>
+
+namespace opselector
+{
+
+std::unique_ptr<luci::Module> getModule(std::string &input_path)
+{
+  // Load model from the file
+  foder::FileLoader file_loader{input_path};
+  std::vector<char> model_data = file_loader.load();
+
+  // Verify flatbuffers
+  flatbuffers::Verifier verifier{reinterpret_cast<uint8_t *>(model_data.data()), model_data.size()};
+  if (!circle::VerifyModelBuffer(verifier))
+  {
+    std::cerr << "ERROR: Invalid input file '" << input_path << "'" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  const circle::Model *circle_model = circle::GetModel(model_data.data());
+  if (circle_model == nullptr)
+  {
+    std::cerr << "ERROR: Failed to load circle '" << input_path << "'" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  // Import from input Circle file
+  luci::Importer importer;
+
+  return importer.importModule(circle_model);
+}
+
+bool exportModule(luci::Module *module, std::string &output_path)
+{
+  luci::CircleExporter exporter;
+
+  luci::CircleFileExpContract contract(module, output_path);
+
+  if (!exporter.invoke(&contract))
+  {
+    std::cerr << "ERROR: Failed to export '" << output_path << "'" << std::endl;
+    return false;
+  }
+
+  return true;
+}
+
+} // namespace opselector
diff --git a/compiler/circle-opselector/src/ModuleIO.h b/compiler/circle-opselector/src/ModuleIO.h
new file mode 100644
index 000000000..39c704bf3
--- /dev/null
+++ b/compiler/circle-opselector/src/ModuleIO.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CIRCLE_OPSELECTOR_MODULEIO_H__
+#define __CIRCLE_OPSELECTOR_MODULEIO_H__
+
+#include <luci/IR/Module.h>
+
+#include <string>
+#include <memory>
+
+namespace opselector
+{
+
+std::unique_ptr<luci::Module> getModule(std::string &input_path);
+bool exportModule(luci::Module *module, std::string &output_path);
+
+} // namespace opselector
+
+#endif // __CIRCLE_OPSELECTOR_MODULEIO_H__
diff --git a/compiler/circle-opselector/src/ModuleIO.test.cpp b/compiler/circle-opselector/src/ModuleIO.test.cpp
new file mode 100644
index 000000000..a1e5c2070
--- /dev/null
+++ b/compiler/circle-opselector/src/ModuleIO.test.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ModuleIO.h"
+
+#include <gtest/gtest.h>
+
+TEST(ModuleIOTest, Export_nullptr)
+{
+  std::string output_path = "./test.out.circle";
+
+  ASSERT_EQ(false, opselector::exportModule(nullptr, output_path));
+}
diff --git a/compiler/circle-opselector/src/TestHelper.h b/compiler/circle-opselector/src/TestHelper.h
new file mode 100644
index 000000000..966e2b219
--- /dev/null
+++ b/compiler/circle-opselector/src/TestHelper.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CIRCLE_OPSELECTOR_TEST_HELPER_H__
+#define __CIRCLE_OPSELECTOR_TEST_HELPER_H__
+
+#include <cassert>
+#include <string.h>
+
+template <size_t N> class Argv
+{
+public:
+  typedef char *pchar_t;
+
+public:
+  ~Argv()
+  {
+    for (size_t n = 0; n < _ptr; ++n)
+      delete _argv[n];
+  }
+
+  void add(const char *in)
+  {
+    assert(_ptr < N);
+    _argv[_ptr] = new char[strlen(in) + 1];
+    strncpy(_argv[_ptr], in, strlen(in) + 1);
+    _ptr++;
+  }
+
+  pchar_t *argv(void) { return _argv; }
+
+private:
+  pchar_t _argv[N] = {
+    nullptr,
+  };
+  size_t _ptr = 0;
+};
+
+#endif // __CIRCLE_OPSELECTOR_TEST_HELPER_H__
diff --git a/compiler/circle-part-value-test/CMakeLists.txt b/compiler/circle-part-value-test/CMakeLists.txt
index b4b1b19db..1cfbcbd9b 100644
--- a/compiler/circle-part-value-test/CMakeLists.txt
+++ b/compiler/circle-part-value-test/CMakeLists.txt
@@ -106,7 +106,7 @@ add_dependencies(circle_part_value_test_prepare common_artifacts_deps)
 add_test(NAME circle_part_value_test
   COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/part_eval_all.sh"
           "${CMAKE_CURRENT_BINARY_DIR}"
-          "${NNCC_OVERLAY_DIR}/venv_2_3_0"
+          "${NNCC_OVERLAY_DIR}/venv_2_6_0"
           "$<TARGET_FILE:circle_part_driver>"
           ${PARTITION_LIST}
 )
diff --git a/compiler/circle-partitioner/README.md b/compiler/circle-partitioner/README.md
index e1a0258dc..5fd312e33 100644
--- a/compiler/circle-partitioner/README.md
+++ b/compiler/circle-partitioner/README.md
@@ -49,8 +49,8 @@ DIV=acl_cl
 - `backends`: Existing partition group names which nodes should be placed, in CSV format.
 - `default`: Default group name which should be one of `backends` item.
 - `comply`: How to group nodes of the model.
-   - currently `opcode` is supported
-   - future work: set group by node name or sequence number.
+   - currently `opcode` and `opname` are supported
+   - future work: set group by sequence number.
 
 ##### `[OPCODE`] section
 
diff --git a/compiler/circle-quantizer/src/CircleQuantizer.cpp b/compiler/circle-quantizer/src/CircleQuantizer.cpp
index 5e717d085..1a09a8a2a 100644
--- a/compiler/circle-quantizer/src/CircleQuantizer.cpp
+++ b/compiler/circle-quantizer/src/CircleQuantizer.cpp
@@ -43,6 +43,7 @@ void print_exclusive_options(void)
   std::cout << "    --quantize_dequantize_weights" << std::endl;
   std::cout << "    --quantize_with_minmax" << std::endl;
   std::cout << "    --requantize" << std::endl;
+  std::cout << "    --force_quantparam" << std::endl;
 }
 
 void print_version(void)
@@ -63,6 +64,7 @@ int entry(int argc, char **argv)
   const std::string qdqw = "--quantize_dequantize_weights";
   const std::string qwmm = "--quantize_with_minmax";
   const std::string rq = "--requantize";
+  const std::string fq = "--force_quantparam";
 
   const std::string gpd = "--generate_profile_data";
 
@@ -105,6 +107,15 @@ int entry(int argc, char **argv)
           "Two arguments required: input_dtype(int8) "
           "output_dtype(uint8)");
 
+  arser.add_argument(fq)
+    .nargs(3)
+    .type(arser::DataType::STR_VEC)
+    .required(false)
+    .accumulated(true)
+    .help("Write quantization parameters to the specified tensor. "
+          "Three arguments required: tensor_name(string), "
+          "scale(float) zero_point(int)");
+
   arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model");
   arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model");
 
@@ -123,10 +134,11 @@ int entry(int argc, char **argv)
   }
 
   {
-    // only one of qdqw, qwmm, rq option can be used
+    // only one of qdqw, qwmm, rq, fq option can be used
     int32_t opt_used = arser[qdqw] ? 1 : 0;
     opt_used += arser[qwmm] ? 1 : 0;
     opt_used += arser[rq] ? 1 : 0;
+    opt_used += arser[fq] ? 1 : 0;
     if (opt_used != 1)
     {
       print_exclusive_options();
@@ -185,6 +197,34 @@ int entry(int argc, char **argv)
     options->param(AlgorithmParameters::Quantize_output_dtype, values.at(1));
   }
 
+  if (arser[fq])
+  {
+    auto values = arser.get<std::vector<std::vector<std::string>>>(fq);
+
+    std::vector<std::string> tensors;
+    std::vector<std::string> scales;
+    std::vector<std::string> zero_points;
+
+    for (auto const value : values)
+    {
+      if (value.size() != 3)
+      {
+        std::cerr << arser;
+        return 255;
+      }
+
+      tensors.push_back(value[0]);
+      scales.push_back(value[1]);
+      zero_points.push_back(value[2]);
+    }
+
+    options->enable(Algorithms::ForceQuantParam);
+
+    options->params(AlgorithmParameters::Quantize_tensor_names, tensors);
+    options->params(AlgorithmParameters::Quantize_scales, scales);
+    options->params(AlgorithmParameters::Quantize_zero_points, zero_points);
+  }
+
   std::string input_path = arser.get<std::string>("input");
   std::string output_path = arser.get<std::string>("output");
 
diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst
index 95822c758..f41aac303 100644
--- a/compiler/circle2circle-dredd-recipe-test/test.lst
+++ b/compiler/circle2circle-dredd-recipe-test/test.lst
@@ -41,6 +41,7 @@ Add(Net_Maximum_Minimum_000 PASS transform_min_max_to_relu6)
 Add(BatchMatMulV2_000 PASS resolve_customop_batchmatmul)
 Add(MatMul_000 PASS resolve_customop_matmul)
 Add(DepthwiseConv2D_003 PASS)
+Add(PadV2_001 PASS substitute_padv2_to_pad)
 Add(StridedSlice_003 PASS substitute_strided_slice_to_reshape)
 Add(MaxPoolWithArgmax_000 PASS resolve_customop_max_pool_with_argmax)
 Add(MaxPoolWithArgmax_001 PASS resolve_customop_max_pool_with_argmax)
diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp
index 1998b1646..a5ddb26dc 100644
--- a/compiler/circle2circle/src/Circle2Circle.cpp
+++ b/compiler/circle2circle/src/Circle2Circle.cpp
@@ -98,6 +98,12 @@ int entry(int argc, char **argv)
     .default_value(false)
     .help("This will fold dequantize op");
 
+  arser.add_argument("--fold_dwconv")
+    .nargs(0)
+    .required(false)
+    .default_value(false)
+    .help("This will fold Depthwise Convolution operator with constant inputs");
+
   arser.add_argument("--fold_sparse_to_dense")
     .nargs(0)
     .required(false)
@@ -116,6 +122,12 @@ int entry(int argc, char **argv)
     .default_value(false)
     .help("This will fuse Activation function to a preceding operator");
 
+  arser.add_argument("--fuse_add_with_fully_connected")
+    .nargs(0)
+    .required(false)
+    .default_value(false)
+    .help("This will fuse Add operator to FullyConnected operator");
+
   arser.add_argument("--fuse_add_with_tconv")
     .nargs(0)
     .required(false)
@@ -282,6 +294,12 @@ int entry(int argc, char **argv)
     .default_value(false)
     .help("This will convert certain condition PadV2 to Pad");
 
+  arser.add_argument("--substitute_splitv_to_split")
+    .nargs(0)
+    .required(false)
+    .default_value(false)
+    .help("This will convert certain condition SplitV to Split operator");
+
   arser.add_argument("--substitute_squeeze_to_reshape")
     .nargs(0)
     .required(false)
@@ -300,6 +318,12 @@ int entry(int argc, char **argv)
     .default_value(false)
     .help("This will convert single input Transpose to Reshape");
 
+  arser.add_argument("--expand_broadcast_const")
+    .nargs(0)
+    .required(false)
+    .default_value(false)
+    .help("This will expand broadcastable constant inputs");
+
   arser.add_argument("--convert_nchw_to_nhwc")
     .nargs(0)
     .required(false)
@@ -426,6 +450,8 @@ int entry(int argc, char **argv)
     options->enable(Algorithms::FoldCast);
   if (arser.get<bool>("--fold_dequantize"))
     options->enable(Algorithms::FoldDequantize);
+  if (arser.get<bool>("--fold_dwconv"))
+    options->enable(Algorithms::FoldDepthwiseConv2D);
   if (arser.get<bool>("--fold_sparse_to_dense"))
     options->enable(Algorithms::FoldSparseToDense);
   if (arser.get<bool>("--forward_reshape_to_unaryop"))
@@ -434,6 +460,8 @@ int entry(int argc, char **argv)
     options->enable(Algorithms::FuseActivationFunction);
   if (arser.get<bool>("--fuse_batchnorm_with_conv"))
     options->enable(Algorithms::FuseBatchNormWithConv);
+  if (arser.get<bool>("--fuse_add_with_fully_connected"))
+    options->enable(Algorithms::FuseAddWithFullyConnected);
   if (arser.get<bool>("--fuse_add_with_tconv"))
     options->enable(Algorithms::FuseAddWithTConv);
   if (arser.get<bool>("--fuse_batchnorm_with_dwconv"))
@@ -486,6 +514,8 @@ int entry(int argc, char **argv)
     options->enable(Algorithms::SubstitutePackToReshape);
   if (arser.get<bool>("--substitute_padv2_to_pad"))
     options->enable(Algorithms::SubstitutePadV2ToPad);
+  if (arser.get<bool>("--substitute_splitv_to_split"))
+    options->enable(Algorithms::SubstituteSplitVToSplit);
   if (arser.get<bool>("--substitute_squeeze_to_reshape"))
     options->enable(Algorithms::SubstituteSqueezeToReshape);
   if (arser.get<bool>("--substitute_strided_slice_to_reshape"))
@@ -496,6 +526,8 @@ int entry(int argc, char **argv)
     options->enable(Algorithms::TransformMinMaxToRelu6Pass);
   if (arser.get<bool>("--transform_min_relu_to_relu6"))
     options->enable(Algorithms::TransformMinReluToRelu6Pass);
+  if (arser.get<bool>("--expand_broadcast_const"))
+    options->enable(Algorithms::ExpandBroadcastConst);
 
   if (arser.get<bool>("--mute_warnings"))
     settings->set(luci::UserSettings::Key::MuteWarnings, true);
diff --git a/compiler/circledump/CMakeLists.txt b/compiler/circledump/CMakeLists.txt
index 8ef68370d..7848ac722 100644
--- a/compiler/circledump/CMakeLists.txt
+++ b/compiler/circledump/CMakeLists.txt
@@ -11,6 +11,6 @@ target_include_directories(circledump PRIVATE include)
 target_link_libraries(circledump arser)
 target_link_libraries(circledump mio_circle)
 target_link_libraries(circledump safemain)
-target_link_libraries(circledump flatbuffers)
+target_link_libraries(circledump flatbuffers-1.10)
 
 install(TARGETS circledump DESTINATION bin)
diff --git a/compiler/common-artifacts/CMakeLists.txt b/compiler/common-artifacts/CMakeLists.txt
index edca29b34..6de634a25 100644
--- a/compiler/common-artifacts/CMakeLists.txt
+++ b/compiler/common-artifacts/CMakeLists.txt
@@ -17,6 +17,8 @@ set(VIRTUALENV_OVERLAY_TF_1_13_2 "${NNCC_OVERLAY_DIR}/venv_1_13_2")
 
 # Create python virtual environment with tensorflow 2.3.0
 set(VIRTUALENV_OVERLAY_TF_2_3_0 "${NNCC_OVERLAY_DIR}/venv_2_3_0")
+# Create python virtual environment with tensorflow 2.6.0
+set(VIRTUALENV_OVERLAY_TF_2_6_0 "${NNCC_OVERLAY_DIR}/venv_2_6_0")
 
 add_custom_command(
   OUTPUT ${VIRTUALENV_OVERLAY_TF_1_13_2}
@@ -27,11 +29,16 @@ add_custom_command(
   OUTPUT ${VIRTUALENV_OVERLAY_TF_2_3_0}
   COMMAND ${PYTHON_EXECUTABLE} -m venv ${VIRTUALENV_OVERLAY_TF_2_3_0}
 )
+add_custom_command(
+  OUTPUT ${VIRTUALENV_OVERLAY_TF_2_6_0}
+  COMMAND ${PYTHON_EXECUTABLE} -m venv ${VIRTUALENV_OVERLAY_TF_2_6_0}
+)
 
 # Create requirements.txt and install required pip packages
 set(REQUIREMENTS_FILE "requirements.txt")
 set(REQUIREMENTS_OVERLAY_PATH_TF_1_13_2 "${VIRTUALENV_OVERLAY_TF_1_13_2}/${REQUIREMENTS_FILE}")
 set(REQUIREMENTS_OVERLAY_PATH_TF_2_3_0 "${VIRTUALENV_OVERLAY_TF_2_3_0}/${REQUIREMENTS_FILE}")
+set(REQUIREMENTS_OVERLAY_PATH_TF_2_6_0 "${VIRTUALENV_OVERLAY_TF_2_6_0}/${REQUIREMENTS_FILE}")
 
 # TODO remove version number of '--upgrade pip==20.2.1 setuptools==49.3.0'
 # NOTE adding version is for temporary hotfix of setuptools 50.x.y version
@@ -53,8 +60,23 @@ add_custom_command(
   DEPENDS ${VIRTUALENV_OVERLAY_TF_2_3_0}
 )
 
+add_custom_command(
+  OUTPUT ${REQUIREMENTS_OVERLAY_PATH_TF_2_6_0}
+  COMMAND ${CMAKE_COMMAND} -E remove -f ${REQUIREMENTS_OVERLAY_PATH_TF_2_6_0}
+  COMMAND ${CMAKE_COMMAND} -E echo "tensorflow-cpu==2.6.0" >> ${REQUIREMENTS_OVERLAY_PATH_TF_2_6_0}
+  COMMAND ${CMAKE_COMMAND} -E echo "flatbuffers==1.12" >> ${REQUIREMENTS_OVERLAY_PATH_TF_2_6_0}
+  COMMAND ${VIRTUALENV_OVERLAY_TF_2_6_0}/bin/python -m pip --default-timeout=1000 install --upgrade pip==20.2.1 setuptools==49.3.0
+  COMMAND ${VIRTUALENV_OVERLAY_TF_2_6_0}/bin/python -m pip --default-timeout=1000 install -r ${REQUIREMENTS_OVERLAY_PATH_TF_2_6_0} --upgrade
+  DEPENDS ${VIRTUALENV_OVERLAY_TF_2_6_0}
+)
+
 add_custom_target(common_artifacts_python_deps ALL
-  DEPENDS ${VIRTUALENV_OVERLAY_TF_1_13_2} ${VIRTUALENV_OVERLAY_TF_2_3_0} ${REQUIREMENTS_OVERLAY_PATH_TF_1_13_2} ${REQUIREMENTS_OVERLAY_PATH_TF_2_3_0}
+  DEPENDS ${VIRTUALENV_OVERLAY_TF_1_13_2}
+          ${VIRTUALENV_OVERLAY_TF_2_3_0}
+          ${VIRTUALENV_OVERLAY_TF_2_6_0}
+          ${REQUIREMENTS_OVERLAY_PATH_TF_1_13_2}
+          ${REQUIREMENTS_OVERLAY_PATH_TF_2_3_0}
+          ${REQUIREMENTS_OVERLAY_PATH_TF_2_6_0}
 )
 
 #[[ Generate common resources ]]
diff --git a/compiler/enco/frontend/tflite/CMakeLists.txt b/compiler/enco/frontend/tflite/CMakeLists.txt
index ea10fbc4b..b2de2b34b 100644
--- a/compiler/enco/frontend/tflite/CMakeLists.txt
+++ b/compiler/enco/frontend/tflite/CMakeLists.txt
@@ -1,4 +1,4 @@
-nnas_find_package(FlatBuffers QUIET)
+nnas_find_package(FlatBuffers EXACT 1.10 QUIET)
 
 if(NOT FlatBuffers_FOUND)
   return()
@@ -17,7 +17,7 @@ add_library(enco_tflite_frontend SHARED ${SOURCES})
 target_include_directories(enco_tflite_frontend PRIVATE src)
 target_link_libraries(enco_tflite_frontend enco_intf_frontend)
 target_link_libraries(enco_tflite_frontend enco_intf_cmdline)
-target_link_libraries(enco_tflite_frontend flatbuffers)
+target_link_libraries(enco_tflite_frontend flatbuffers-1.10)
 target_link_libraries(enco_tflite_frontend enco_tflite_schema)
 target_link_libraries(enco_tflite_frontend morph)
 target_link_libraries(enco_tflite_frontend cwrap)
diff --git a/compiler/exo/CMakeLists.txt b/compiler/exo/CMakeLists.txt
index e686cbb83..9d02f7cba 100644
--- a/compiler/exo/CMakeLists.txt
+++ b/compiler/exo/CMakeLists.txt
@@ -1,4 +1,4 @@
-nnas_find_package(FlatBuffers QUIET)
+nnas_find_package(FlatBuffers EXACT 1.10 QUIET)
 
 if(NOT FlatBuffers_FOUND)
   message(STATUS "Build exo: FALSE (missing FlatBuffers)")
diff --git a/compiler/luci-interpreter/CMakeLists.txt b/compiler/luci-interpreter/CMakeLists.txt
index ab4ec1f43..1f7acee87 100644
--- a/compiler/luci-interpreter/CMakeLists.txt
+++ b/compiler/luci-interpreter/CMakeLists.txt
@@ -4,4 +4,12 @@ if (NOT LUCI_INTERPRETER_PAL_DIR)
     set(LUCI_INTERPRETER_PAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}/pal/linux")
 endif()
 
+set(KERNEL_REGISTER_FILE ${LUCI_INTERPRETER_PAL_DIR}/KernelsToBuild.lst)
+
+if (NOT DEFINED CUSTOM_LUCI_INTERPRETER_SUFFIX)
+    set(LUCI_INTERPRETER_SUFFIX "")
+else()
+    set(LUCI_INTERPRETER_SUFFIX ${CUSTOM_LUCI_INTERPRETER_SUFFIX})
+endif()
+
 add_subdirectory(src)
diff --git a/compiler/luci-interpreter/README.md b/compiler/luci-interpreter/README.md
new file mode 100644
index 000000000..4a9a34e6d
--- /dev/null
+++ b/compiler/luci-interpreter/README.md
@@ -0,0 +1,158 @@
+# luci-interpreter
+
+`luci-interpreter` is an inference engine for neural networks represented in luci IR.
+See `compiler/luci/lang` directory for details about IR.
+You can find useful infrastructure, like importer/exporter, optimizations in `compiler/luci`.
+
+`luci-interpreter` provides:
+- Basic inference functionality, input setters and output getters
+- Interface for inspecting hidden interpreter state, like activation values during inference
+- Customization mechanisms to fit the interpreter to specific platforms, like MCUs
+
+Public interface headers are placed in `luci-interpreter/include/luci_interpreter` directory
+
+## Basic usage
+
+Minimal usage includes:
+- Setting input data
+- Running inference
+- Fetching inference results
+
+Interpreter object is reusable and can run multiple inferences.
+Elements in tensors (input/output/internal) are stored contiguously and have C-like layout:
+This means for tensor t=[[0, 1],[2, 3]], t[0,1] == 1.
+
+Input and output tensors have the same indexes as in original luci model. 
+
+**Usage example:**
+``` c++
+// Note getTensorSize is a function that computes tensor size,
+// it is not part of interpreter and should be implemented by user 
+
+luci_interpreter::Interpreter interpreter(luci_module);
+
+// Set inputs
+// assuming model has only one input and one output
+const auto input_nodes = loco::input_nodes(module->graph());
+
+const auto *input_node = dynamic_cast<const luci::CircleInput *>(input_nodes[0]);
+std::vector<char> input_data(getTensorSize(input_node));
+// Initialize input data here
+
+interpreter.writeInputTensor(input_node, input_data.data(), input_data.size());
+
+// Start inference
+interpreter.interpret();
+
+// Fetch inference results
+const auto output_nodes = loco::output_nodes(module->graph());
+const auto *output_node = dynamic_cast<const luci::CircleOutput *>(output_nodes[0]);
+std::vector<char> output_data(getTensorSize(output_node));
+interpreter.readOutputTensor(output_node, output_data.data(), output_data.size());
+```
+
+## Inspecting intermediate state
+
+Interpreter provides interfaces to investigate internal state of interpreter during inference.
+
+This is done by "observer" mechanism:
+- `Interpreter` class has `attachObserver` method, which takes pointer to `ExecutionObserver` object
+- `ExecutionObserver` defines several callback methods user can override to inject custom code
+
+ExecutionObserver provides three callbacks:
+- `postTensorWrite` checks contents of output tensor after operation execution
+- `preOperatorExecute` notifies that interpreter is going to execute operation
+- `postOperatorExecute` notifies that interpreter has finished execution of an operation
+
+See `luci-interpreter/include/luci_interpreter/Interpreter.h` for this interface details.
+
+**Usage example:**
+``` c++
+class CustomExecutionObserver: public luci_interpreter::ExecutionObserver
+{
+public:
+  void postTensorWrite(const luci::CircleNode *node, const Tensor *tensor) override
+  {
+    if (tensor->element_type() != loco::DataType::FLOAT32)
+      return;
+    for (int i = 0; i < tensor->shape().num_elements(); ++i)
+      std::cout << tensor->data<float>[i] << ", ";
+  }
+
+  // User observer can override only needed methods,
+  // others will inherit empty implementation from base observer.
+
+  // void preOperatorExecute(const luci::CircleNode *node);
+  // void postOperatorExecute(const luci::CircleNode *node);
+};
+
+luci_interpreter::Interpreter interpreter(module);
+CustomExecutionObserver observer;
+interpreter.attachObserver(&observer);
+
+// initialize input_data
+interpreter.writeInputTensor(input_node, input_data.data(), input_data.size());
+
+interpreter.interpret();
+```
+
+## Customizing inference
+
+### Memory manager
+
+Interpreter provides a handle for altering default memory management mechanisms.
+
+This is done by `MemoryManger` interface, see `luci-interpreter/include/luci_interpreter/MemoryManager.h` for implementation details.
+
+This header contains `IMemoryManager` abstract class which is responsible for allocation and dealocation of tensors' memory.
+
+User can construct an interpreter with one of predefined memory managers or their own custom memory manager.
+Note that one memory manager could be shared between multiple interpreter instances, because an interpreter does not own the manager object. 
+
+List of predefined memory managers:
+- `SimpleMemoryManager` This is a simple wrapper around new/delete, default one.
+- `TestMemoryManager` Memorizes all allocated memory and releases it in Manager desctuctor, used in kernel unit tests.
+- `BuddyMemoryManager` Implements Buddy algorithm, uses external buffer for tensor data allocations, does not need new/delete.
+- `StaticMemoryManger` Uses precomputed memory allocation plan. Requires preparation with MemoryPlanner, but could reduce memory consumption in restricted environments (like MCUs).
+
+**SimpleMemoryManager usage example:**
+
+No need to select anything, to use this memory manager.
+``` c++
+luci_interpreter::Interpreter interpreter(module);
+```
+
+**TestMemoryManager usage example:**
+
+``` c++
+luci_interpreter::TestMemoryManager mm;
+luci_interpreter::Interpreter interpreter(module, &mm);
+```
+
+**BuddyMemoryManager usage example:**
+
+`BuddyMemoryManager` implements a classic allocation algorithm: https://en.wikipedia.org/wiki/Buddy_memory_allocation.
+
+This allocator uses an external buffer as a memory pool. That allows to use static memory arrays for allocations.
+
+Limitations
+- Current implementation uses only lower power-of-two bytes of given buffer.
+
+  For example for 1000 bytes buffer, only lower 512 bytes will be used.
+- Current implementation can handle maximum 4 gigabyte memory pool
+
+``` c++
+  constexpr int buffer_size = 2048;
+  static uint8_t buffer[buffer_size];
+  luci_interpreter::BuddyMemoryManager memory_manager(buffer, buffer_size);
+  luci_interpreter::Interpreter interpreter(module.get(), &memory_manager);
+```
+
+**StaticMemoryManager usage example:**
+``` c++
+TBD when it is merged
+```
+
+## Further reading
+
+If you want to participate in development, please read `DEVELOPER.md` for SW architecture details.
diff --git a/compiler/luci-interpreter/include/luci_interpreter/BuddyMemoryManager.h b/compiler/luci-interpreter/include/luci_interpreter/BuddyMemoryManager.h
new file mode 100644
index 000000000..205baa626
--- /dev/null
+++ b/compiler/luci-interpreter/include/luci_interpreter/BuddyMemoryManager.h
@@ -0,0 +1,144 @@
+/* Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci_interpreter/MemoryManager.h"
+
+#ifndef LUCI_INTERPRETER_BUDDY_MEMORY_MANAGER_H
+#define LUCI_INTERPRETER_BUDDY_MEMORY_MANAGER_H
+
+namespace luci_interpreter
+{
+
+class BuddyMemoryManager : public IMemoryManager
+{
+public:
+  BuddyMemoryManager(uint8_t *memory_start, int32_t memSize);
+
+  void allocate_memory(luci_interpreter::Tensor &tensor) final;
+  void release_memory(luci_interpreter::Tensor &tensor) final;
+
+private:
+  struct Block
+  {
+    Block *next_free;
+    bool is_free;
+    uint32_t size;
+    // debug field
+    Block *self;
+  };
+
+  Block *_start_block;
+  int32_t _num_blocks;
+  uint32_t _size;
+  Block *_free_blocks[32]{};
+
+  static int32_t lowerLog2(uint32_t val)
+  {
+    int32_t i = 0;
+    while (val >>= 1)
+      i++;
+
+    return i;
+  }
+
+  void addToBlocks(Block *block, int32_t l)
+  {
+    if (!block)
+      return;
+
+    block->next_free = _free_blocks[l];
+    _free_blocks[l] = block;
+  }
+
+  void removeFromBlocks(const Block *block, int32_t l)
+  {
+    if (!block)
+      return;
+
+    Block *tmp = _free_blocks[l];
+
+    if (block == tmp)
+    {
+      _free_blocks[l] = block->next_free;
+      return;
+    }
+
+    while (tmp)
+    {
+      if (tmp->next_free == block)
+      {
+        tmp->next_free = block->next_free;
+        return;
+      }
+
+      tmp = tmp->next_free;
+    }
+  }
+
+  void divideBlock(Block *block, int32_t l)
+  {
+    int32_t size = ((block->size + sizeof(Block)) / 2) - sizeof(Block);
+
+    removeFromBlocks(block, l);
+
+    // there is no need to add to the free_blocks list here
+    block->is_free = true;
+    block->size = size;
+    block->self = block;
+
+    Block *buddy;
+    buddy = (Block *)((uint8_t *)block + sizeof(Block) + size);
+    buddy->is_free = true;
+    buddy->size = size;
+    buddy->self = buddy;
+
+    addToBlocks(buddy, l - 1);
+  }
+
+  Block *mergeBlock(Block *block)
+  {
+    Block *buddy;
+
+    const int32_t l = lowerLog2(block->size + sizeof(Block));
+
+    const int64_t address = ((uint8_t *)block - (uint8_t *)_start_block);
+    buddy = (Block *)((address ^ (1 << l)) + (uint8_t *)_start_block);
+
+    if (!buddy->is_free || buddy->size != block->size)
+      return nullptr;
+
+    if (block > buddy)
+    {
+      Block *x = block;
+      block = buddy;
+      buddy = x;
+    }
+
+    removeFromBlocks(block, l);
+    removeFromBlocks(buddy, l);
+
+    block->size = block->size * 2 + sizeof(Block);
+    block->is_free = true;
+    block->self = block;
+
+    addToBlocks(block, l + 1);
+
+    return block;
+  }
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_BUDDY_MEMORY_MANAGER_H
diff --git a/compiler/luci-interpreter/include/luci_interpreter/Interpreter.h b/compiler/luci-interpreter/include/luci_interpreter/Interpreter.h
index 7a14bf6f8..7dee8a7f2 100644
--- a/compiler/luci-interpreter/include/luci_interpreter/Interpreter.h
+++ b/compiler/luci-interpreter/include/luci_interpreter/Interpreter.h
@@ -22,6 +22,7 @@
 #include <luci/IR/Nodes/CircleInput.h>
 #include <luci/IR/Nodes/CircleOutput.h>
 
+#include "luci_interpreter/MemoryManager.h"
 #include <luci/IR/Module.h>
 
 #include <memory>
@@ -49,7 +50,7 @@ public:
 class Interpreter
 {
 public:
-  explicit Interpreter(const luci::Module *module);
+  explicit Interpreter(const luci::Module *module, IMemoryManager *memory_manager = nullptr);
 
   ~Interpreter();
 
@@ -64,7 +65,11 @@ public:
   const Tensor *getTensor(const loco::Node *node) { return _node_to_tensor[node]; }
 
 private:
+  // _default_memory_manager should be before _runtime_module due to
+  // the order of deletion in the destructor
+  std::unique_ptr<IMemoryManager> _default_memory_manager = nullptr;
   std::unique_ptr<class RuntimeModule> _runtime_module;
+  IMemoryManager *_memory_manager = nullptr;
 
   // Observer functionality support.
   std::unique_ptr<struct RuntimeToIR> _runtime_to_ir;
diff --git a/compiler/luci-interpreter/include/luci_interpreter/MemoryManager.h b/compiler/luci-interpreter/include/luci_interpreter/MemoryManager.h
new file mode 100644
index 000000000..f32c52095
--- /dev/null
+++ b/compiler/luci-interpreter/include/luci_interpreter/MemoryManager.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_MEMORY_MANAGER_H
+#define LUCI_INTERPRETER_MEMORY_MANAGER_H
+
+#include "luci_interpreter/core/DataType.h"
+#include "luci_interpreter/core/Tensor.h"
+
+namespace luci_interpreter
+{
+
+class IMemoryManager
+{
+public:
+  virtual void allocate_memory(luci_interpreter::Tensor &tensor) = 0;
+  virtual void release_memory(luci_interpreter::Tensor &tensor) = 0;
+
+  virtual ~IMemoryManager() = default;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_MEMORY_MANAGER_H
diff --git a/compiler/luci-interpreter/include/luci_interpreter/SimpleMemoryManager.h b/compiler/luci-interpreter/include/luci_interpreter/SimpleMemoryManager.h
new file mode 100644
index 000000000..658a1c609
--- /dev/null
+++ b/compiler/luci-interpreter/include/luci_interpreter/SimpleMemoryManager.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_SIMPLE_MEMORY_MANAGER_H
+#define LUCI_INTERPRETER_SIMPLE_MEMORY_MANAGER_H
+
+#include "luci_interpreter/MemoryManager.h"
+
+namespace luci_interpreter
+{
+
+class SimpleMemoryManager : public IMemoryManager
+{
+public:
+  void allocate_memory(luci_interpreter::Tensor &tensor) final;
+  void release_memory(luci_interpreter::Tensor &tensor) final;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_SIMPLE_MEMORY_MANAGER_H
diff --git a/compiler/luci-interpreter/include/luci_interpreter/StaticMemoryManager.h b/compiler/luci-interpreter/include/luci_interpreter/StaticMemoryManager.h
new file mode 100644
index 000000000..ded7bde79
--- /dev/null
+++ b/compiler/luci-interpreter/include/luci_interpreter/StaticMemoryManager.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_STATIC_MEMORY_MANAGER_H
+#define LUCI_INTERPRETER_STATIC_MEMORY_MANAGER_H
+
+#include "luci_interpreter/MemoryManager.h"
+
+namespace luci_interpreter
+{
+
+// Used for allocations in static buffer, using offsets defined in luci model.
+class StaticMemoryManager : public IMemoryManager
+{
+public:
+  StaticMemoryManager() = delete;
+
+  explicit StaticMemoryManager(uint8_t *buffer_ptr) : _buffer_ptr(buffer_ptr)
+  { /* Do nothing */
+  }
+
+  void allocate_memory(luci_interpreter::Tensor &tensor) final;
+  void release_memory(luci_interpreter::Tensor &tensor) final;
+
+private:
+  // Stores a pointer to the beginning of the allocated memory buffer.
+  uint8_t *_buffer_ptr;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_STATIC_MEMORY_MANAGER_H
diff --git a/compiler/luci-interpreter/include/luci_interpreter/TestMemoryManager.h b/compiler/luci-interpreter/include/luci_interpreter/TestMemoryManager.h
new file mode 100644
index 000000000..397bbed76
--- /dev/null
+++ b/compiler/luci-interpreter/include/luci_interpreter/TestMemoryManager.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_TEST_MEMORY_MANAGER_H
+#define LUCI_INTERPRETER_TEST_MEMORY_MANAGER_H
+
+#include "luci_interpreter/MemoryManager.h"
+
+namespace luci_interpreter
+{
+// Memory Manager for using in kernels tests. This eliminates the need to manually delete the
+// allocated memory in tests. This mem_manager remembers all its allocations and in destructor
+// delete all allocations.
+class TestMemoryManager : public IMemoryManager
+{
+public:
+  void allocate_memory(luci_interpreter::Tensor &tensor) final;
+  void release_memory(luci_interpreter::Tensor &tensor) final;
+
+  ~TestMemoryManager() override
+  {
+    for (auto allocation : allocations)
+    {
+      delete[] allocation;
+    }
+  }
+
+private:
+  std::vector<uint8_t *> allocations;
+};
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_TEST_MEMORY_MANAGER_H
diff --git a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
index e356bce92..bb9ff6d4a 100644
--- a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
+++ b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
@@ -107,9 +107,6 @@ public:
     return _quantization.zero_point[0];
   }
 
-  void allocate();
-  void deallocate();
-
   const std::vector<float> &scales() const { return _quantization.scale; }
 
   const std::vector<int32_t> &zero_points() const { return _quantization.zero_point; }
@@ -118,15 +115,16 @@ public:
 
   template <typename T> const T *data() const
   {
-    assert(_data_allocated);
-    return reinterpret_cast<const T *>(_data.get());
+    static_assert(std::is_same<uint8_t, char>::value or
+                  std::is_same<uint8_t, unsigned char>::value);
+    return reinterpret_cast<const T *>(_data);
   }
 
   template <typename T> T *data()
   {
-    if (!_data_allocated)
-      allocate();
-    return reinterpret_cast<T *>(_data.get());
+    static_assert(std::is_same<uint8_t, char>::value or
+                  std::is_same<uint8_t, unsigned char>::value);
+    return reinterpret_cast<T *>(_data);
   }
 
   const std::string &name() const { return _name; }
@@ -137,13 +135,50 @@ public:
 
   void resize(const Shape &new_shape);
 
+  void set_data_buffer(uint8_t *buffer)
+  {
+    if (buffer == nullptr)
+    {
+      _data_allocated = false;
+    }
+    else
+    {
+      _data_allocated = true;
+    }
+    _data = buffer;
+  }
+
+  bool is_observable() const { return _is_observable; }
+
+  void set_observable(bool value) { _is_observable = value; }
+
+  bool is_allocatable() const { return _is_allocatable; }
+
+  void set_allocatable(bool value) { _is_allocatable = value; }
+
+  bool is_data_allocated() const { return _data_allocated; }
+
+  int32_t get_offset() const { return _offset; }
+
+  void set_offset(int32_t offset) { _offset = offset; }
+
 private:
   DataType _element_type;
   Shape _shape;
   AffineQuantization _quantization;
-  std::unique_ptr<uint8_t[]> _data;
+  uint8_t *_data;
   std::string _name;
   bool _data_allocated;
+  // Write of tensor is reported to registered Observers only if this tensor is observable
+  // This is needed for tensors used in kernel implementation, but not present in original model.
+  bool _is_observable = true;
+  // Memory manager is called for tensor only if it is "allocatable".
+  // Kernel configuration could disable allocation of some tensors if they are not needed for
+  // particular operation.
+  bool _is_allocatable = true;
+  // Used by static memory manager.
+  // Stores the offset from the beginning of the allocated memory buffer.
+  int32_t _offset = -1;
 };
 
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst b/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst
new file mode 100644
index 000000000..9d541276c
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst
@@ -0,0 +1,68 @@
+REGISTER_KERNEL(Add)
+REGISTER_KERNEL(ArgMax)
+REGISTER_KERNEL(AveragePool2D)
+REGISTER_KERNEL(BatchToSpaceND)
+REGISTER_KERNEL(Cast)
+REGISTER_KERNEL(Concatenation)
+REGISTER_KERNEL(Conv2D)
+REGISTER_KERNEL(DepthToSpace)
+REGISTER_KERNEL(DepthwiseConv2D)
+REGISTER_KERNEL(Div)
+REGISTER_KERNEL(Elu)
+REGISTER_KERNEL(Exp)
+REGISTER_KERNEL(Floor)
+REGISTER_KERNEL(FloorDiv)
+REGISTER_KERNEL(Equal)
+REGISTER_KERNEL(FullyConnected)
+REGISTER_KERNEL(Greater)
+REGISTER_KERNEL(GreaterEqual)
+REGISTER_KERNEL(If)
+REGISTER_KERNEL(InstanceNorm)
+REGISTER_KERNEL(L2Normalize)
+REGISTER_KERNEL(L2Pool2D)
+REGISTER_KERNEL(LeakyRelu)
+REGISTER_KERNEL(Less)
+REGISTER_KERNEL(LessEqual)
+REGISTER_KERNEL(LocalResponseNormalization)
+REGISTER_KERNEL(LogicalAnd)
+REGISTER_KERNEL(LogicalNot)
+REGISTER_KERNEL(LogicalOr)
+REGISTER_KERNEL(Logistic)
+REGISTER_KERNEL(LogSoftmax)
+REGISTER_KERNEL(Maximum)
+REGISTER_KERNEL(MaxPool2D)
+REGISTER_KERNEL(Mean)
+REGISTER_KERNEL(Minimum)
+REGISTER_KERNEL(MirrorPad)
+REGISTER_KERNEL(Mul)
+REGISTER_KERNEL(Neg)
+REGISTER_KERNEL(NotEqual)
+REGISTER_KERNEL(Pack)
+REGISTER_KERNEL(Pad)
+REGISTER_KERNEL(PadV2)
+REGISTER_KERNEL(Pow)
+REGISTER_KERNEL(PRelu)
+REGISTER_KERNEL(Relu)
+REGISTER_KERNEL(Relu6)
+REGISTER_KERNEL(Reshape)
+REGISTER_KERNEL(ResizeBilinear)
+REGISTER_KERNEL(ResizeNearestNeighbor)
+REGISTER_KERNEL(ReverseV2)
+REGISTER_KERNEL(Rsqrt)
+REGISTER_KERNEL(Slice)
+REGISTER_KERNEL(Softmax)
+REGISTER_KERNEL(SpaceToBatchND)
+REGISTER_KERNEL(SpaceToDepth)
+REGISTER_KERNEL(Split)
+REGISTER_KERNEL(SplitV)
+REGISTER_KERNEL(StridedSlice)
+REGISTER_KERNEL(Sqrt)
+REGISTER_KERNEL(Square)
+REGISTER_KERNEL(SquaredDifference)
+REGISTER_KERNEL(Squeeze)
+REGISTER_KERNEL(Sub)
+REGISTER_KERNEL(Tanh)
+REGISTER_KERNEL(Transpose)
+REGISTER_KERNEL(TransposeConv)
+REGISTER_KERNEL(Unpack)
+REGISTER_KERNEL(While)
diff --git a/compiler/luci-interpreter/pal/linux/PALArgMax.h b/compiler/luci-interpreter/pal/linux/PALArgMax.h
new file mode 100644
index 000000000..21e63296d
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALArgMax.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_ARGMAX_H
+#define LUCI_INTERPRETER_PAL_ARGMAX_H
+
+#include <tensorflow/lite/kernels/internal/reference/arg_min_max.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T1, typename T2, typename T3>
+static inline void ArgMinMax(const tflite::RuntimeShape &input1_shape, const T1 *input1_data,
+                             const T2 *axis, const tflite::RuntimeShape &output_shape,
+                             T3 *output_data, const std::greater<T1> cmp)
+{
+  tflite::reference_ops::ArgMinMax(input1_shape, input1_data, axis, output_shape, output_data, cmp);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_ARGMAX_H
diff --git a/compiler/luci-interpreter/pal/linux/PALBatchToSpaceND.h b/compiler/luci-interpreter/pal/linux/PALBatchToSpaceND.h
new file mode 100644
index 000000000..3fe2022ed
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALBatchToSpaceND.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
+#define LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+BatchToSpaceND(const tflite::RuntimeShape &unextended_input1_shape, const T *input1_data,
+               const tflite::RuntimeShape &unextended_input2_shape, const int32 *block_shape_data,
+               const tflite::RuntimeShape &unextended_input3_shape, const int32 *crops_data,
+               const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::optimized_ops::BatchToSpaceND(
+    unextended_input1_shape, input1_data, unextended_input2_shape, block_shape_data,
+    unextended_input3_shape, crops_data, unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
diff --git a/compiler/luci-interpreter/pal/linux/PALConv2d.h b/compiler/luci-interpreter/pal/linux/PALConv2d.h
new file mode 100644
index 000000000..2550dd5d7
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALConv2d.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_CONV2D_H
+#define LUCI_INTERPRETER_PAL_CONV2D_H
+
+#include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/conv.h>
+
+namespace luci_interpreter_pal
+{
+static inline void Conv(const tflite::ConvParams &params, const tflite::RuntimeShape &input_shape,
+                        const float *input_data, const tflite::RuntimeShape &filter_shape,
+                        const float *filter_data, const tflite::RuntimeShape &bias_shape,
+                        const float *bias_data, const tflite::RuntimeShape &output_shape,
+                        float *output_data, const tflite::RuntimeShape &im2col_shape,
+                        float *im2col_data)
+{
+  if (im2col_data)
+  {
+    tflite::optimized_ops::Conv(params, input_shape, input_data, filter_shape, filter_data,
+                                bias_shape, bias_data, output_shape, output_data, im2col_shape,
+                                im2col_data);
+  }
+  else
+    tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data,
+                                bias_shape, bias_data, output_shape, output_data,
+                                tflite::RuntimeShape(), nullptr);
+}
+
+static inline void Conv(const tflite::ConvParams &params, const tflite::RuntimeShape &input_shape,
+                        const uint8 *input_data, const tflite::RuntimeShape &filter_shape,
+                        const uint8 *filter_data, const tflite::RuntimeShape &bias_shape,
+                        const int32 *bias_data, const tflite::RuntimeShape &output_shape,
+                        uint8 *output_data, const tflite::RuntimeShape &im2col_shape,
+                        uint8 *im2col_data)
+{
+  // TODO This should only be done once (although it takes only a few microseconds).
+  //  Also, the user should be able to adjust the number of threads.
+  auto gemmlowp_context = std::make_unique<gemmlowp::GemmContext>();
+  gemmlowp_context->set_max_num_threads(static_cast<int>(std::thread::hardware_concurrency()));
+
+  tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data,
+                              bias_shape, bias_data, output_shape, output_data, im2col_shape,
+                              im2col_data, gemmlowp_context.get());
+}
+
+static inline void ConvPerChannel(const tflite::ConvParams &params, const int32_t *mult,
+                                  const int32_t *shifts, const tflite::RuntimeShape &input_shape,
+                                  const int8 *input_data, const tflite::RuntimeShape &filter_shape,
+                                  const int8 *filter_data, const tflite::RuntimeShape &bias_shape,
+                                  const int32 *bias_data, const tflite::RuntimeShape &output_shape,
+                                  int8 *output_data, const tflite::RuntimeShape &im2col_shape,
+                                  int8 *im2col_data)
+{
+  (void)im2col_shape;
+  (void)im2col_data;
+  // TODO enable optimized version
+  tflite::reference_integer_ops::ConvPerChannel(params, mult, shifts, input_shape, input_data,
+                                                filter_shape, filter_data, bias_shape, bias_data,
+                                                output_shape, output_data);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_CONV2D_H
diff --git a/compiler/luci-interpreter/pal/linux/PALDepthToSpace.h b/compiler/luci-interpreter/pal/linux/PALDepthToSpace.h
new file mode 100644
index 000000000..f9ebfcfb5
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALDepthToSpace.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
+#define LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void DepthToSpace(const tflite::DepthToSpaceParams &op_params,
+                                const tflite::RuntimeShape &unextended_input_shape,
+                                const T *input_data,
+                                const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::optimized_ops::DepthToSpace(op_params, unextended_input_shape, input_data,
+                                      unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
diff --git a/compiler/luci-interpreter/pal/linux/PALElu.h b/compiler/luci-interpreter/pal/linux/PALElu.h
new file mode 100644
index 000000000..cb365ffd0
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALElu.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_ELU_H
+#define LUCI_INTERPRETER_PAL_ELU_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+static inline void Elu(const tflite::RuntimeShape &input_shape, const float *input_data,
+                       const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::optimized_ops::Elu(input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_ELU_H
diff --git a/compiler/luci-interpreter/pal/linux/PALL2Normalize.h b/compiler/luci-interpreter/pal/linux/PALL2Normalize.h
new file mode 100644
index 000000000..6c663e21f
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALL2Normalize.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_L2NORMALIZE_H
+#define LUCI_INTERPRETER_PAL_L2NORMALIZE_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void L2Normalization(const tflite::L2NormalizationParams &op_params,
+                                   const tflite::RuntimeShape &input_shape, const T *input_data,
+                                   const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  tflite::optimized_ops::L2Normalization(op_params, input_shape, input_data, output_shape,
+                                         output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_L2NORMALIZE_H
diff --git a/compiler/luci-interpreter/pal/linux/PALL2Pool2D.h b/compiler/luci-interpreter/pal/linux/PALL2Pool2D.h
new file mode 100644
index 000000000..aac57f2b2
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALL2Pool2D.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_L2POOL2D_H
+#define LUCI_INTERPRETER_PAL_L2POOL2D_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void L2Pool(const tflite::PoolParams &params, const tflite::RuntimeShape &input_shape,
+                          const T *input_data, const tflite::RuntimeShape &output_shape,
+                          T *output_data)
+{
+  tflite::optimized_ops::L2Pool(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_L2POOL2D_H
diff --git a/compiler/luci-interpreter/pal/linux/PALLeakyRelu.h b/compiler/luci-interpreter/pal/linux/PALLeakyRelu.h
new file mode 100644
index 000000000..e8209bae6
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALLeakyRelu.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_LEAKYRELU_H
+#define LUCI_INTERPRETER_PAL_LEAKYRELU_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+static inline void LeakyRelu(const tflite::LeakyReluParams &params,
+                             const tflite::RuntimeShape &input_shape, const float *input_data,
+                             const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::optimized_ops::LeakyRelu(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_LEAKYRELU_H
diff --git a/compiler/luci-interpreter/pal/linux/PALLocalResponseNormalization.h b/compiler/luci-interpreter/pal/linux/PALLocalResponseNormalization.h
new file mode 100644
index 000000000..54f7f0916
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALLocalResponseNormalization.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_LOCALRESPONSENORMALIZATION_H
+#define LUCI_INTERPRETER_PAL_LOCALRESPONSENORMALIZATION_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+static inline void
+LocalResponseNormalization(const tflite::LocalResponseNormalizationParams &op_params,
+                           const tflite::RuntimeShape &input_shape, const float *input_data,
+                           const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::optimized_ops::LocalResponseNormalization(op_params, input_shape, input_data,
+                                                    output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_LOCALRESPONSENORMALIZATION_H
diff --git a/compiler/luci-interpreter/pal/linux/PALLogSoftmax.h b/compiler/luci-interpreter/pal/linux/PALLogSoftmax.h
new file mode 100644
index 000000000..a32e3eec6
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALLogSoftmax.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_LOGSOFTMAX_H
+#define LUCI_INTERPRETER_PAL_LOGSOFTMAX_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+static inline void PopulateSoftmaxLookupTable(tflite::SoftmaxParams *data, float input_scale,
+                                              float beta)
+{
+  tflite::optimized_ops::PopulateSoftmaxLookupTable(data, input_scale, beta);
+}
+
+static inline void InitializeParams(tflite::SoftmaxParams *params, float input_scale, float beta)
+{
+  // Do nothing for linux
+  (void)params;
+  (void)input_scale;
+  (void)beta;
+}
+
+static inline void LogSoftmax(const tflite::SoftmaxParams &params, float input_scale,
+                              const tflite::RuntimeShape &input_shape, const uint8 *input_data,
+                              const tflite::RuntimeShape &output_shape, uint8 *output_data)
+{
+  tflite::optimized_ops::LogSoftmax(params, input_scale, input_shape, input_data, output_shape,
+                                    output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_LOGSOFTMAX_H
diff --git a/compiler/luci-interpreter/pal/linux/PALMul.h b/compiler/luci-interpreter/pal/linux/PALMul.h
new file mode 100644
index 000000000..cfaec1b58
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALMul.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_MUL_H
+#define LUCI_INTERPRETER_PAL_MUL_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+static inline void Mul(tflite::ArithmeticParams &params, const tflite::RuntimeShape &input1_shape,
+                       const float *input1_data, const tflite::RuntimeShape &input2_shape,
+                       const float *input2_data, const tflite::RuntimeShape &output_shape,
+                       float *output_data)
+{
+  tflite::optimized_ops::Mul(params, input1_shape, input1_data, input2_shape, input2_data,
+                             output_shape, output_data);
+}
+
+static inline void BroadcastMul4DSlow(tflite::ArithmeticParams &params,
+                                      const tflite::RuntimeShape &input1_shape,
+                                      const float *input1_data,
+                                      const tflite::RuntimeShape &input2_shape,
+                                      const float *input2_data,
+                                      const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::optimized_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape,
+                                            input2_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_MUL_H
diff --git a/compiler/luci-interpreter/pal/linux/PALNeg.h b/compiler/luci-interpreter/pal/linux/PALNeg.h
new file mode 100644
index 000000000..797ffee1b
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALNeg.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_NEG_H
+#define LUCI_INTERPRETER_PAL_NEG_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Negate(const tflite::RuntimeShape &input_shape, const T *input_data,
+                          const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  tflite::reference_ops::Negate(input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_NEG_H
diff --git a/compiler/luci-interpreter/pal/linux/PALRelu.h b/compiler/luci-interpreter/pal/linux/PALRelu.h
new file mode 100644
index 000000000..b4c715d3e
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALRelu.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RELU_H
+#define LUCI_INTERPRETER_PAL_RELU_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+static inline void Relu(const tflite::RuntimeShape &input_shape, const float *input_data,
+                        const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::optimized_ops::Relu(input_shape, input_data, output_shape, output_data);
+}
+
+template <typename T>
+static inline void ReluX(const tflite::ReluParams &params, const tflite::RuntimeShape &input_shape,
+                         const T *input_data, const tflite::RuntimeShape &output_shape,
+                         T *output_data)
+{
+  tflite::optimized_ops::ReluX(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RELU_H
diff --git a/compiler/luci-interpreter/pal/linux/PALRelu6.h b/compiler/luci-interpreter/pal/linux/PALRelu6.h
new file mode 100644
index 000000000..bf2f91aa5
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALRelu6.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RELU6_H
+#define LUCI_INTERPRETER_PAL_RELU6_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+static inline void Relu6(const tflite::RuntimeShape &input_shape, const float *input_data,
+                         const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::optimized_ops::Relu6(input_shape, input_data, output_shape, output_data);
+}
+
+template <typename T>
+static inline void ReluX(const tflite::ReluParams &params, const tflite::RuntimeShape &input_shape,
+                         const T *input_data, const tflite::RuntimeShape &output_shape,
+                         T *output_data)
+{
+  tflite::optimized_ops::ReluX(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RELU6_H
diff --git a/compiler/luci-interpreter/pal/linux/PALResizeBilinear.h b/compiler/luci-interpreter/pal/linux/PALResizeBilinear.h
new file mode 100644
index 000000000..7380081dc
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALResizeBilinear.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
+#define LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
+
+#include <tensorflow/lite/kernels/internal/optimized/resize_bilinear.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+ResizeBilinear(const tflite::ResizeBilinearParams &op_params,
+               const tflite::RuntimeShape &unextended_input_shape, const T *input_data,
+               const tflite::RuntimeShape &output_size_shape, const int32 *output_size_data,
+               const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::optimized_ops::ResizeBilinear(op_params, unextended_input_shape, input_data,
+                                        output_size_shape, output_size_data,
+                                        unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
diff --git a/compiler/luci-interpreter/pal/linux/PALResizeNearestNeighbor.h b/compiler/luci-interpreter/pal/linux/PALResizeNearestNeighbor.h
new file mode 100644
index 000000000..74d19265b
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALResizeNearestNeighbor.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
+#define LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+ResizeNearestNeighbor(const tflite::ResizeNearestNeighborParams &op_params,
+                      const tflite::RuntimeShape &unextended_input_shape, const T *input_data,
+                      const tflite::RuntimeShape &output_size_shape, const int32 *output_size_data,
+                      const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::optimized_ops::ResizeNearestNeighbor(op_params, unextended_input_shape, input_data,
+                                               output_size_shape, output_size_data,
+                                               unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
diff --git a/compiler/luci-interpreter/pal/linux/PALSlice.h b/compiler/luci-interpreter/pal/linux/PALSlice.h
new file mode 100644
index 000000000..640a71684
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALSlice.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SLICE_H
+#define LUCI_INTERPRETER_PAL_SLICE_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Slice(const tflite::SliceParams &op_params,
+                         const tflite::RuntimeShape &input_shape, const T *input_data,
+                         const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  tflite::optimized_ops::Slice(op_params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SLICE_H
diff --git a/compiler/luci-interpreter/pal/linux/PALSoftmax.h b/compiler/luci-interpreter/pal/linux/PALSoftmax.h
new file mode 100644
index 000000000..b197e79d1
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALSoftmax.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SOFTMAX_H
+#define LUCI_INTERPRETER_PAL_SOFTMAX_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+static inline void PopulateSoftmaxLookupTable(tflite::SoftmaxParams *data, float input_scale,
+                                              float beta)
+{
+  tflite::optimized_ops::PopulateSoftmaxLookupTable(data, input_scale, beta);
+}
+
+static inline void InitializeParams(tflite::SoftmaxParams *params, float input_scale, float beta)
+{
+  // Do nothing for linux
+  (void)params;
+  (void)input_scale;
+  (void)beta;
+}
+
+template <typename In, typename Out>
+static inline void Softmax(const tflite::SoftmaxParams &params,
+                           const tflite::RuntimeShape &input_shape, const In *input_data,
+                           const tflite::RuntimeShape &output_shape, Out *output_data)
+{
+  tflite::optimized_ops::Softmax(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SOFTMAX_H
diff --git a/compiler/luci-interpreter/pal/linux/PALSpaceToBatchND.h b/compiler/luci-interpreter/pal/linux/PALSpaceToBatchND.h
new file mode 100644
index 000000000..5e8de9ba3
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALSpaceToBatchND.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
+#define LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+SpaceToBatchND(const tflite::SpaceToBatchParams &params,
+               const tflite::RuntimeShape &unextended_input1_shape, const T *input1_data,
+               const tflite::RuntimeShape &unextended_input2_shape, const int32 *block_shape_data,
+               const tflite::RuntimeShape &unextended_input3_shape, const int32 *paddings_data,
+               const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::optimized_ops::SpaceToBatchND(
+    params, unextended_input1_shape, input1_data, unextended_input2_shape, block_shape_data,
+    unextended_input3_shape, paddings_data, unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
diff --git a/compiler/luci-interpreter/pal/linux/PALSpaceToDepth.h b/compiler/luci-interpreter/pal/linux/PALSpaceToDepth.h
new file mode 100644
index 000000000..52d2a5bb1
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALSpaceToDepth.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SPACETODEPTH_H
+#define LUCI_INTERPRETER_PAL_SPACETODEPTH_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void SpaceToDepth(const tflite::SpaceToDepthParams &op_params,
+                                const tflite::RuntimeShape &unextended_input_shape,
+                                const T *input_data,
+                                const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::optimized_ops::SpaceToDepth(op_params, unextended_input_shape, input_data,
+                                      unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SPACETODEPTH_H
diff --git a/compiler/luci-interpreter/pal/linux/PALSplit.h b/compiler/luci-interpreter/pal/linux/PALSplit.h
new file mode 100644
index 000000000..4d8da72d8
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALSplit.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SPLIT_H
+#define LUCI_INTERPRETER_PAL_SPLIT_H
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename Scalar>
+static inline void Split(const tflite::SplitParams &params, const tflite::RuntimeShape &input_shape,
+                         const Scalar *input_data, const tflite::RuntimeShape *const *output_shapes,
+                         Scalar *const *output_data)
+{
+  tflite::optimized_ops::Split(params, input_shape, input_data, output_shapes, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SPLIT_H
diff --git a/compiler/luci-interpreter/pal/linux/PALSub.h b/compiler/luci-interpreter/pal/linux/PALSub.h
new file mode 100644
index 000000000..04080d619
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALSub.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SUB_H
+#define LUCI_INTERPRETER_PAL_SUB_H
+
+#include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Sub(const tflite::ArithmeticParams &params,
+                       const tflite::RuntimeShape &input1_shape, const T *input1_data,
+                       const tflite::RuntimeShape &input2_shape, const T *input2_data,
+                       const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  tflite::optimized_ops::Sub(params, input1_shape, input1_data, input2_shape, input2_data,
+                             output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SUB_H
diff --git a/compiler/luci-interpreter/pal/linux/pal.cmake b/compiler/luci-interpreter/pal/linux/pal.cmake
index da880c64c..84349e0bf 100644
--- a/compiler/luci-interpreter/pal/linux/pal.cmake
+++ b/compiler/luci-interpreter/pal/linux/pal.cmake
@@ -1,8 +1,8 @@
 macro(initialize_pal)
-    nnas_find_package(TensorFlowSource EXACT 2.3.0 QUIET)
-    nnas_find_package(TensorFlowGEMMLowpSource EXACT 2.3.0 QUIET)
-    nnas_find_package(TensorFlowEigenSource EXACT 2.3.0 QUIET)
-    nnas_find_package(TensorFlowRuySource EXACT 2.3.0 QUIET)
+    nnas_find_package(TensorFlowSource EXACT 2.6.0 QUIET)
+    nnas_find_package(TensorFlowGEMMLowpSource EXACT 2.6.0 QUIET)
+    nnas_find_package(TensorFlowEigenSource EXACT 2.6.0 QUIET)
+    nnas_find_package(TensorFlowRuySource EXACT 2.6.0 QUIET)
 
     if (NOT TensorFlowSource_FOUND)
         message(STATUS "Skipping luci-interpreter: TensorFlow not found")
@@ -43,7 +43,12 @@ macro(add_pal_to_target TGT)
     set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc)
     add_library(luci_interpreter_linux_pal STATIC ${PAL_SOURCES})
     set_target_properties(luci_interpreter_linux_pal PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_include_directories(luci_interpreter_linux_pal SYSTEM PRIVATE "${TensorFlowSource_DIR}")
+    target_include_directories(luci_interpreter_linux_pal SYSTEM PRIVATE
+            "${TensorFlowRuySource_DIR}"
+            "${TensorFlowGEMMLowpSource_DIR}"
+            "${TensorFlowEigenSource_DIR}"
+            "${TensorFlowSource_DIR}"
+    )
 
     target_link_libraries(${TGT} PRIVATE Threads::Threads luci_interpreter_linux_pal)
 endmacro()
diff --git a/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst b/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst
new file mode 100644
index 000000000..771974afe
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst
@@ -0,0 +1,56 @@
+REGISTER_KERNEL(Add)
+REGISTER_KERNEL(ArgMax)
+REGISTER_KERNEL(AveragePool2D)
+REGISTER_KERNEL(BatchToSpaceND)
+REGISTER_KERNEL(Cast)
+REGISTER_KERNEL(Concatenation)
+REGISTER_KERNEL(Conv2D)
+REGISTER_KERNEL(DepthToSpace)
+REGISTER_KERNEL(DepthwiseConv2D)
+REGISTER_KERNEL(Div)
+REGISTER_KERNEL(Elu)
+REGISTER_KERNEL(Exp)
+REGISTER_KERNEL(Floor)
+REGISTER_KERNEL(FloorDiv)
+REGISTER_KERNEL(Equal)
+REGISTER_KERNEL(FullyConnected)
+REGISTER_KERNEL(Greater)
+REGISTER_KERNEL(GreaterEqual)
+REGISTER_KERNEL(If)
+REGISTER_KERNEL(InstanceNorm)
+REGISTER_KERNEL(L2Normalize)
+REGISTER_KERNEL(L2Pool2D)
+REGISTER_KERNEL(LeakyRelu)
+REGISTER_KERNEL(Less)
+REGISTER_KERNEL(LessEqual)
+REGISTER_KERNEL(LogicalAnd)
+REGISTER_KERNEL(LogicalNot)
+REGISTER_KERNEL(LogicalOr)
+REGISTER_KERNEL(Logistic)
+REGISTER_KERNEL(Maximum)
+REGISTER_KERNEL(MaxPool2D)
+REGISTER_KERNEL(Minimum)
+REGISTER_KERNEL(MirrorPad)
+REGISTER_KERNEL(Mul)
+REGISTER_KERNEL(Neg)
+REGISTER_KERNEL(NotEqual)
+REGISTER_KERNEL(Pad)
+REGISTER_KERNEL(PadV2)
+REGISTER_KERNEL(PRelu)
+REGISTER_KERNEL(Reshape)
+REGISTER_KERNEL(ResizeBilinear)
+REGISTER_KERNEL(ResizeNearestNeighbor)
+REGISTER_KERNEL(Rsqrt)
+REGISTER_KERNEL(Softmax)
+REGISTER_KERNEL(SpaceToBatchND)
+REGISTER_KERNEL(SpaceToDepth)
+REGISTER_KERNEL(StridedSlice)
+REGISTER_KERNEL(Sqrt)
+REGISTER_KERNEL(Square)
+REGISTER_KERNEL(SquaredDifference)
+REGISTER_KERNEL(Squeeze)
+REGISTER_KERNEL(Sub)
+REGISTER_KERNEL(Tanh)
+REGISTER_KERNEL(Transpose)
+REGISTER_KERNEL(TransposeConv)
+REGISTER_KERNEL(While)
diff --git a/compiler/luci-interpreter/pal/mcu/PALArgMax.h b/compiler/luci-interpreter/pal/mcu/PALArgMax.h
new file mode 100644
index 000000000..21e63296d
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALArgMax.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_ARGMAX_H
+#define LUCI_INTERPRETER_PAL_ARGMAX_H
+
+#include <tensorflow/lite/kernels/internal/reference/arg_min_max.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T1, typename T2, typename T3>
+static inline void ArgMinMax(const tflite::RuntimeShape &input1_shape, const T1 *input1_data,
+                             const T2 *axis, const tflite::RuntimeShape &output_shape,
+                             T3 *output_data, const std::greater<T1> cmp)
+{
+  tflite::reference_ops::ArgMinMax(input1_shape, input1_data, axis, output_shape, output_data, cmp);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_ARGMAX_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALBatchToSpaceND.h b/compiler/luci-interpreter/pal/mcu/PALBatchToSpaceND.h
new file mode 100644
index 000000000..4dd77ffdc
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALBatchToSpaceND.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
+#define LUCI_INTERPRETER_PAL_ARGMAX_H
+
+#include <tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+BatchToSpaceND(const tflite::RuntimeShape &unextended_input1_shape, const T *input1_data,
+               const tflite::RuntimeShape &unextended_input2_shape, const int32 *block_shape_data,
+               const tflite::RuntimeShape &unextended_input3_shape, const int32 *crops_data,
+               const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::reference_ops::BatchToSpaceND(
+    unextended_input1_shape, input1_data, unextended_input2_shape, block_shape_data,
+    unextended_input3_shape, crops_data, unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALConv2d.h b/compiler/luci-interpreter/pal/mcu/PALConv2d.h
new file mode 100644
index 000000000..0a8ae4e48
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALConv2d.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_CONV2D_H
+#define LUCI_INTERPRETER_PAL_CONV2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/conv.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/conv.h>
+
+namespace luci_interpreter_pal
+{
+static inline void Conv(const tflite::ConvParams &params, const tflite::RuntimeShape &input_shape,
+                        const float *input_data, const tflite::RuntimeShape &filter_shape,
+                        const float *filter_data, const tflite::RuntimeShape &bias_shape,
+                        const float *bias_data, const tflite::RuntimeShape &output_shape,
+                        float *output_data, const tflite::RuntimeShape &im2col_shape,
+                        float *im2col_data)
+{
+  (void)im2col_shape;
+  (void)im2col_data;
+  tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data,
+                              bias_shape, bias_data, output_shape, output_data,
+                              tflite::RuntimeShape(), nullptr);
+}
+
+static inline void Conv(const tflite::ConvParams &params, const tflite::RuntimeShape &input_shape,
+                        const uint8 *input_data, const tflite::RuntimeShape &filter_shape,
+                        const uint8 *filter_data, const tflite::RuntimeShape &bias_shape,
+                        const int32 *bias_data, const tflite::RuntimeShape &output_shape,
+                        uint8 *output_data, const tflite::RuntimeShape &im2col_shape,
+                        uint8 *im2col_data)
+{
+  (void)im2col_shape;
+  (void)im2col_data;
+  tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data,
+                              bias_shape, bias_data, output_shape, output_data, im2col_shape,
+                              im2col_data, nullptr);
+}
+
+static inline void ConvPerChannel(const tflite::ConvParams &params, const int32_t *mult,
+                                  const int32_t *shifts, const tflite::RuntimeShape &input_shape,
+                                  const int8 *input_data, const tflite::RuntimeShape &filter_shape,
+                                  const int8 *filter_data, const tflite::RuntimeShape &bias_shape,
+                                  const int32 *bias_data, const tflite::RuntimeShape &output_shape,
+                                  int8 *output_data, const tflite::RuntimeShape &im2col_shape,
+                                  int8 *im2col_data)
+{
+  (void)im2col_shape;
+  (void)im2col_data;
+  tflite::reference_integer_ops::ConvPerChannel(params, mult, shifts, input_shape, input_data,
+                                                filter_shape, filter_data, bias_shape, bias_data,
+                                                output_shape, output_data);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_CONV2D_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALDepthToSpace.h b/compiler/luci-interpreter/pal/mcu/PALDepthToSpace.h
new file mode 100644
index 000000000..8463e571e
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALDepthToSpace.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
+#define LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
+
+#include <tensorflow/lite/kernels/internal/reference/depth_to_space.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void DepthToSpace(const tflite::DepthToSpaceParams &op_params,
+                                const tflite::RuntimeShape &unextended_input_shape,
+                                const T *input_data,
+                                const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::reference_ops::DepthToSpace(op_params, unextended_input_shape, input_data,
+                                      unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALElu.h b/compiler/luci-interpreter/pal/mcu/PALElu.h
new file mode 100644
index 000000000..4089d0a0c
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALElu.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_ELU_H
+#define LUCI_INTERPRETER_PAL_ELU_H
+
+#include <tensorflow/lite/kernels/internal/reference/elu.h>
+
+namespace luci_interpreter_pal
+{
+
+static inline void Elu(const tflite::RuntimeShape &input_shape, const float *input_data,
+                       const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::reference_ops::Elu(input_shape, input_data, output_shape, output_data);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_ELU_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALL2Normalize.h b/compiler/luci-interpreter/pal/mcu/PALL2Normalize.h
new file mode 100644
index 000000000..f84742a44
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALL2Normalize.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_L2NORMALIZE_H
+#define LUCI_INTERPRETER_PAL_L2NORMALIZE_H
+
+#include <tensorflow/lite/kernels/internal/reference/l2normalization.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void L2Normalization(const tflite::L2NormalizationParams &op_params,
+                                   const tflite::RuntimeShape &input_shape, const T *input_data,
+                                   const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  tflite::reference_ops::L2Normalization(op_params, input_shape, input_data, output_shape,
+                                         output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_L2NORMALIZE_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALL2Pool2D.h b/compiler/luci-interpreter/pal/mcu/PALL2Pool2D.h
new file mode 100644
index 000000000..38a302fc6
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALL2Pool2D.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_L2POOL2D_H
+#define LUCI_INTERPRETER_PAL_L2POOL2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/pooling.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void L2Pool(const tflite::PoolParams &params, const tflite::RuntimeShape &input_shape,
+                          const T *input_data, const tflite::RuntimeShape &output_shape,
+                          T *output_data)
+{
+  tflite::reference_ops::L2Pool(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_L2POOL2D_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALLeakyRelu.h b/compiler/luci-interpreter/pal/mcu/PALLeakyRelu.h
new file mode 100644
index 000000000..9ccd2224f
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALLeakyRelu.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_LEAKYRELU_H
+#define LUCI_INTERPRETER_PAL_LEAKYRELU_H
+
+#include <tensorflow/lite/kernels/internal/reference/leaky_relu.h>
+
+namespace luci_interpreter_pal
+{
+static inline void LeakyRelu(const tflite::LeakyReluParams &params,
+                             const tflite::RuntimeShape &input_shape, const float *input_data,
+                             const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::reference_ops::LeakyRelu(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_LEAKYRELU_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALMul.h b/compiler/luci-interpreter/pal/mcu/PALMul.h
new file mode 100644
index 000000000..2b46b100c
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALMul.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_MUL_H
+#define LUCI_INTERPRETER_PAL_MUL_H
+
+#include <tensorflow/lite/kernels/internal/reference/mul.h>
+
+namespace luci_interpreter_pal
+{
+static inline void Mul(tflite::ArithmeticParams &params, const tflite::RuntimeShape &input1_shape,
+                       const float *input1_data, const tflite::RuntimeShape &input2_shape,
+                       const float *input2_data, const tflite::RuntimeShape &output_shape,
+                       float *output_data)
+{
+  tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape,
+                                            input2_data, output_shape, output_data);
+}
+
+static inline void BroadcastMul4DSlow(tflite::ArithmeticParams &params,
+                                      const tflite::RuntimeShape &input1_shape,
+                                      const float *input1_data,
+                                      const tflite::RuntimeShape &input2_shape,
+                                      const float *input2_data,
+                                      const tflite::RuntimeShape &output_shape, float *output_data)
+{
+  tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape,
+                                            input2_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_MUL_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALNeg.h b/compiler/luci-interpreter/pal/mcu/PALNeg.h
new file mode 100644
index 000000000..be5903a0c
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALNeg.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_NEG_H
+#define LUCI_INTERPRETER_PAL_NEG_H
+
+#include <tensorflow/lite/kernels/internal/reference/neg.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Negate(const tflite::RuntimeShape &input_shape, const T *input_data,
+                          const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  tflite::reference_ops::Negate(input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_NEG_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALResizeBilinear.h b/compiler/luci-interpreter/pal/mcu/PALResizeBilinear.h
new file mode 100644
index 000000000..cc9f0fd54
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALResizeBilinear.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
+#define LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
+
+#include <tensorflow/lite/kernels/internal/reference/resize_bilinear.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+ResizeBilinear(const tflite::ResizeBilinearParams &op_params,
+               const tflite::RuntimeShape &unextended_input_shape, const T *input_data,
+               const tflite::RuntimeShape &output_size_shape, const int32 *output_size_data,
+               const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::reference_ops::ResizeBilinear(op_params, unextended_input_shape, input_data,
+                                        output_size_shape, output_size_data,
+                                        unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALResizeNearestNeighbor.h b/compiler/luci-interpreter/pal/mcu/PALResizeNearestNeighbor.h
new file mode 100644
index 000000000..f4d5a6ed3
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALResizeNearestNeighbor.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
+#define LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
+
+#include <tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+ResizeNearestNeighbor(const tflite::ResizeNearestNeighborParams &op_params,
+                      const tflite::RuntimeShape &unextended_input_shape, const T *input_data,
+                      const tflite::RuntimeShape &output_size_shape, const int32 *output_size_data,
+                      const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::reference_ops::ResizeNearestNeighbor(op_params, unextended_input_shape, input_data,
+                                               output_size_shape, output_size_data,
+                                               unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALSoftmax.h b/compiler/luci-interpreter/pal/mcu/PALSoftmax.h
new file mode 100644
index 000000000..9838b542d
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALSoftmax.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SOFTMAX_H
+#define LUCI_INTERPRETER_PAL_SOFTMAX_H
+
+#include <tensorflow/lite/kernels/internal/reference/softmax.h>
+
+namespace luci_interpreter_pal
+{
+static inline void PopulateSoftmaxLookupTable(tflite::SoftmaxParams *data, float input_scale,
+                                              float beta)
+{
+  // Do nothing for mcu
+  (void)data;
+  (void)input_scale;
+  (void)beta;
+}
+
+static inline void InitializeParams(tflite::SoftmaxParams *params, float input_scale, float beta)
+{
+  int32 input_beta_multiplier;
+  int input_beta_left_shift;
+  static const int kScaledDiffIntegerBits = 5;
+  tflite::PreprocessSoftmaxScaling(beta, input_scale, kScaledDiffIntegerBits,
+                                   &input_beta_multiplier, &input_beta_left_shift);
+
+  params->input_multiplier = input_beta_multiplier;
+  params->input_left_shift = input_beta_left_shift;
+  params->diff_min =
+    -tflite::CalculateInputRadius(kScaledDiffIntegerBits, params->input_left_shift);
+}
+
+template <typename T>
+static inline void Softmax(const tflite::SoftmaxParams &params,
+                           const tflite::RuntimeShape &input_shape, const T *input_data,
+                           const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  // MARK: At this moment this operation doesn't support on mcu
+  assert(false && "Softmax NYI");
+  (void)params;
+  (void)input_shape;
+  (void)input_data;
+  (void)output_shape;
+  (void)output_data;
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SOFTMAX_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALSpaceToBatchND.h b/compiler/luci-interpreter/pal/mcu/PALSpaceToBatchND.h
new file mode 100644
index 000000000..fdddaa929
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALSpaceToBatchND.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
+#define LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
+
+#include <tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+SpaceToBatchND(const tflite::SpaceToBatchParams &params,
+               const tflite::RuntimeShape &unextended_input1_shape, const T *input1_data,
+               const tflite::RuntimeShape &unextended_input2_shape, const int32 *block_shape_data,
+               const tflite::RuntimeShape &unextended_input3_shape, const int32 *paddings_data,
+               const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::reference_ops::SpaceToBatchND(
+    params, unextended_input1_shape, input1_data, unextended_input2_shape, block_shape_data,
+    unextended_input3_shape, paddings_data, unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALSpaceToDepth.h b/compiler/luci-interpreter/pal/mcu/PALSpaceToDepth.h
new file mode 100644
index 000000000..816b7f663
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALSpaceToDepth.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SPACETODEPTH_H
+#define LUCI_INTERPRETER_PAL_SPACETODEPTH_H
+
+#include <tensorflow/lite/kernels/internal/reference/space_to_depth.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void SpaceToDepth(const tflite::SpaceToDepthParams &op_params,
+                                const tflite::RuntimeShape &unextended_input_shape,
+                                const T *input_data,
+                                const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+  tflite::reference_ops::SpaceToDepth(op_params, unextended_input_shape, input_data,
+                                      unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SPACETODEPTH_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALSub.h b/compiler/luci-interpreter/pal/mcu/PALSub.h
new file mode 100644
index 000000000..ea57578c6
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALSub.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SUB_H
+#define LUCI_INTERPRETER_PAL_SUB_H
+
+#include <tensorflow/lite/kernels/internal/reference/sub.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Sub(const tflite::ArithmeticParams &params,
+                       const tflite::RuntimeShape &input1_shape, const T *input1_data,
+                       const tflite::RuntimeShape &input2_shape, const T *input2_data,
+                       const tflite::RuntimeShape &output_shape, T *output_data)
+{
+  tflite::reference_ops::Sub(params, input1_shape, input1_data, input2_shape, input2_data,
+                             output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SUB_H
diff --git a/compiler/luci-interpreter/pal/mcu/pal.cmake b/compiler/luci-interpreter/pal/mcu/pal.cmake
index 2307ac727..a479d407b 100644
--- a/compiler/luci-interpreter/pal/mcu/pal.cmake
+++ b/compiler/luci-interpreter/pal/mcu/pal.cmake
@@ -1,8 +1,8 @@
 macro(initialize_pal)
-    nnas_find_package(TensorFlowSource EXACT 2.3.0 QUIET)
-    nnas_find_package(TensorFlowGEMMLowpSource EXACT 2.3.0 QUIET)
-    nnas_find_package(TensorFlowEigenSource EXACT 2.3.0 QUIET)
-    nnas_find_package(TensorFlowRuySource EXACT 2.3.0 QUIET)
+    nnas_find_package(TensorFlowSource EXACT 2.6.0 QUIET)
+    nnas_find_package(TensorFlowGEMMLowpSource EXACT 2.6.0 QUIET)
+    nnas_find_package(TensorFlowEigenSource EXACT 2.6.0 QUIET)
+    nnas_find_package(TensorFlowRuySource EXACT 2.6.0 QUIET)
 
     if (NOT TensorFlowSource_FOUND)
         message(STATUS "Skipping luci-interpreter: TensorFlow not found")
@@ -30,7 +30,7 @@ endmacro()
 
 macro(add_pal_to_target TGT)
     target_include_directories(${TGT} PRIVATE "${PAL}")
-    target_include_directories(${TGT} SYSTEM PRIVATE
+    target_include_directories(${TGT} PRIVATE
             "${TensorFlowRuySource_DIR}"
             "${TensorFlowGEMMLowpSource_DIR}"
             "${TensorFlowEigenSource_DIR}"
@@ -42,7 +42,12 @@ macro(add_pal_to_target TGT)
     set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc)
     add_library(luci_interpreter_mcu_pal STATIC ${PAL_SOURCES})
     set_target_properties(luci_interpreter_mcu_pal PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_include_directories(luci_interpreter_mcu_pal SYSTEM PRIVATE "${TensorFlowSource_DIR}")
+    target_include_directories(luci_interpreter_mcu_pal PRIVATE
+            "${TensorFlowRuySource_DIR}"
+            "${TensorFlowGEMMLowpSource_DIR}"
+            "${TensorFlowEigenSource_DIR}"
+            "${TensorFlowSource_DIR}"
+    )
 
     target_link_libraries(${TGT} PRIVATE luci_interpreter_mcu_pal)
     #target_link_libraries(${TGT} PRIVATE Threads::Threads luci_interpreter_mcu_pal)
diff --git a/compiler/luci-interpreter/src/BuddyMemoryManager.cpp b/compiler/luci-interpreter/src/BuddyMemoryManager.cpp
new file mode 100644
index 000000000..6ad1f320c
--- /dev/null
+++ b/compiler/luci-interpreter/src/BuddyMemoryManager.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci_interpreter/BuddyMemoryManager.h"
+
+namespace luci_interpreter
+{
+
+BuddyMemoryManager::BuddyMemoryManager(uint8_t *memory_start, int32_t memSize)
+{
+  int32_t p = lowerLog2(memSize);
+
+  // We assume that the requested size of memory does not exceed 4 GB
+  assert(p < 32);
+  memSize = 1 << p;
+
+  _start_block = reinterpret_cast<Block *>(memory_start);
+  _start_block->size = memSize - sizeof(Block);
+  _start_block->is_free = true;
+  _start_block->self = _start_block;
+  _num_blocks = 0;
+  _size = _start_block->size;
+
+  for (auto &_free_block : _free_blocks)
+    _free_block = nullptr;
+
+  addToBlocks(_start_block, p);
+}
+
+void BuddyMemoryManager::allocate_memory(luci_interpreter::Tensor &tensor)
+{
+  const size_t element_size = getDataTypeSize(tensor.element_type());
+  const int32_t num_elements = tensor.shape().num_elements();
+  auto size = num_elements * element_size;
+  auto footprint = size + sizeof(Block);
+  auto l = (footprint & (footprint - 1)) == 0
+             ? lowerLog2(footprint)
+             : lowerLog2(footprint) + 1; // check footprint is pow_of_2
+
+  while (l < 32 && !_free_blocks[l])
+    l++;
+
+  assert(l < 32);
+
+  Block *tmp;
+  tmp = _free_blocks[l];
+  removeFromBlocks(tmp, l);
+
+  while ((tmp->size + sizeof(Block)) / 2 >= size + sizeof(Block))
+  {
+    divideBlock(tmp, l);
+    l--;
+  }
+
+  tmp->is_free = false;
+  tmp->self = tmp;
+  _num_blocks++;
+
+  auto *data = (uint8_t *)(tmp + 1);
+  tensor.set_data_buffer(data);
+}
+
+void BuddyMemoryManager::release_memory(luci_interpreter::Tensor &tensor)
+{
+  auto data = tensor.data<void>();
+  auto *tmp = (Block *)((uint8_t *)data - sizeof(Block));
+
+  assert(tmp->self == tmp);
+
+  tmp->is_free = true;
+  addToBlocks(tmp, lowerLog2(tmp->size + sizeof(Block)));
+
+  while (tmp)
+    if (tmp->size == _size)
+      break;
+    else
+      tmp = mergeBlock(tmp);
+
+  _num_blocks--;
+  tensor.set_data_buffer(nullptr);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/BuddyMemoryManager.test.cpp b/compiler/luci-interpreter/src/BuddyMemoryManager.test.cpp
new file mode 100644
index 000000000..29fb767b7
--- /dev/null
+++ b/compiler/luci-interpreter/src/BuddyMemoryManager.test.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci_interpreter/BuddyMemoryManager.h"
+#include <gtest/gtest.h>
+
+namespace luci_interpreter
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(BuddyMemoryManager, basic)
+{
+  auto mem_pool = std::make_unique<uint8_t[]>(200);
+  auto buddy_memory_manager = std::make_unique<BuddyMemoryManager>(mem_pool.get(), 130);
+  Tensor first_tensor(DataType::U8, Shape({8}), AffineQuantization{}, "first_tensor");
+
+  buddy_memory_manager->allocate_memory(first_tensor);
+
+  uint8_t data_1[] = {1, 2, 3, 4, 5, 6, 7, 8};
+
+  first_tensor.writeData(data_1, 8);
+  uint8_t array_1[8];
+  first_tensor.readData(array_1, 8);
+  for (int i = 0; i < 8; i++)
+  {
+    EXPECT_EQ(data_1[i], array_1[i]);
+  }
+
+  Tensor second_tensor(DataType::U8, Shape({2, 5}), AffineQuantization{}, "second_tensor");
+  buddy_memory_manager->allocate_memory(second_tensor);
+
+  uint8_t data_2[2][5] = {{11, 22, 33, 44, 55}, {12, 23, 34, 45, 56}};
+  second_tensor.writeData(data_2, 10);
+
+  uint8_t array_2[2][5];
+  second_tensor.readData(array_2, 10);
+  for (int i = 0; i < 2; i++)
+  {
+    for (int j = 0; j < 5; j++)
+    {
+      EXPECT_EQ(data_2[i][j], array_2[i][j]);
+    }
+  }
+
+  buddy_memory_manager->release_memory(first_tensor);
+  EXPECT_EQ(first_tensor.data<void>(), nullptr);
+
+  buddy_memory_manager->release_memory(second_tensor);
+  EXPECT_EQ(second_tensor.data<void>(), nullptr);
+}
+
+} // namespace
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/CMakeLists.txt b/compiler/luci-interpreter/src/CMakeLists.txt
index 6f34b6117..e37150336 100644
--- a/compiler/luci-interpreter/src/CMakeLists.txt
+++ b/compiler/luci-interpreter/src/CMakeLists.txt
@@ -1,13 +1,19 @@
-include(${LUCI_INTERPRETER_PAL_DIR}/pal.cmake)
+include("${LUCI_INTERPRETER_PAL_DIR}/pal.cmake")
 
 initialize_pal()
 
 if (NOT PAL_INITIALIZED)
+  message("PAL Failed to initialize, skip luci-interpreter")
   return()
 endif()
 
 message(STATUS "LUCI INTERPRETER BEGIN")
 
+set(LUCI_INTERPRETER_BINARY "luci_interpreter${LUCI_INTERPRETER_SUFFIX}")
+set(LUCI_INTERPRETER_CORE "luci_interpreter_core${LUCI_INTERPRETER_SUFFIX}")
+set(LUCI_INTERPRETER_KERNELS "luci_interpreter_kernels${LUCI_INTERPRETER_SUFFIX}")
+set(LUCI_INTERPRETER_LOADER "luci_interpreter_loader${LUCI_INTERPRETER_SUFFIX}")
+
 add_subdirectory(core)
 message(STATUS "LUCI INTERPRETER CORE")
 add_subdirectory(kernels)
@@ -19,15 +25,34 @@ message(STATUS "LUCI INTERPTER INITALIZED")
 
 set(SOURCES
     "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/Interpreter.h"
-    Interpreter.cpp)
-
-add_library(luci_interpreter SHARED ${SOURCES})
-target_include_directories(luci_interpreter PUBLIC "${LUCI_INTERPRETER_INCLUDE_DIR}")
-target_include_directories(luci_interpreter PRIVATE "${LUCI_INTERPRETER_SOURCE_DIR}")
-target_link_libraries(luci_interpreter
-    PUBLIC luci_lang luci_interpreter_loader luci_interpreter_core
+    Interpreter.cpp "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/SimpleMemoryManager.h" SimpleMemoryManager.cpp
+        "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/TestMemoryManager.h" TestMemoryManager.cpp
+        "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/BuddyMemoryManager.h" BuddyMemoryManager.cpp
+        "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/StaticMemoryManager.h" StaticMemoryManager.cpp)
+
+if (NOT LUCI_INTERPRETER_STATIC)
+  add_library(${LUCI_INTERPRETER_BINARY} SHARED ${SOURCES})
+else ()
+  add_library(${LUCI_INTERPRETER_BINARY} STATIC ${SOURCES})
+endif ()
+
+set(TEST_SOURCES BuddyMemoryManager.test.cpp)
+
+target_include_directories(${LUCI_INTERPRETER_BINARY} PUBLIC "${LUCI_INTERPRETER_INCLUDE_DIR}")
+target_include_directories(${LUCI_INTERPRETER_BINARY} PRIVATE "${LUCI_INTERPRETER_SOURCE_DIR}")
+target_link_libraries(${LUCI_INTERPRETER_BINARY}
+    PUBLIC luci_lang ${LUCI_INTERPRETER_LOADER} ${LUCI_INTERPRETER_CORE}
     PRIVATE nncc_common)
 
-install(TARGETS luci_interpreter DESTINATION lib)
+install(TARGETS ${LUCI_INTERPRETER_BINARY} DESTINATION lib)
 install(DIRECTORY include/ DESTINATION include
         FILES_MATCHING PATTERN "*.h")
+
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
+GTest_AddTest(buddy_manager_test ${TEST_SOURCES})
+target_link_libraries(buddy_manager_test ${LUCI_INTERPRETER_BINARY})
diff --git a/compiler/luci-interpreter/src/Interpreter.cpp b/compiler/luci-interpreter/src/Interpreter.cpp
index b57b691d0..1b8792a6c 100644
--- a/compiler/luci-interpreter/src/Interpreter.cpp
+++ b/compiler/luci-interpreter/src/Interpreter.cpp
@@ -15,6 +15,7 @@
  */
 
 #include "luci_interpreter/Interpreter.h"
+#include "luci_interpreter/SimpleMemoryManager.h"
 
 #include "loader/ModuleLoader.h"
 
@@ -69,12 +70,25 @@ private:
 
 } // namespace
 
-Interpreter::Interpreter(const luci::Module *module)
+Interpreter::Interpreter(const luci::Module *module,
+                         luci_interpreter::IMemoryManager *memory_manager)
 {
   _runtime_to_ir = std::make_unique<RuntimeToIR>();
   _event_notifier = std::make_unique<EventNotifierImpl>(*_runtime_to_ir, _observers);
   _runtime_module = std::make_unique<RuntimeModule>(_event_notifier.get());
-  ModuleLoader loader(module, _runtime_module.get(), *_runtime_to_ir, _node_to_tensor);
+
+  if (memory_manager == nullptr)
+  {
+    _default_memory_manager = std::make_unique<SimpleMemoryManager>();
+    _memory_manager = _default_memory_manager.get();
+  }
+  else
+  {
+    _memory_manager = memory_manager;
+  }
+
+  ModuleLoader loader(module, _runtime_module.get(), *_runtime_to_ir, _node_to_tensor,
+                      _memory_manager);
   loader.load();
 }
 
diff --git a/compiler/luci-interpreter/src/SimpleMemoryManager.cpp b/compiler/luci-interpreter/src/SimpleMemoryManager.cpp
new file mode 100644
index 000000000..230e39896
--- /dev/null
+++ b/compiler/luci-interpreter/src/SimpleMemoryManager.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci_interpreter/SimpleMemoryManager.h"
+
+namespace luci_interpreter
+{
+
+void SimpleMemoryManager::allocate_memory(luci_interpreter::Tensor &tensor)
+{
+  if (!tensor.is_allocatable())
+  {
+    return;
+  }
+  if (tensor.is_data_allocated())
+  {
+    release_memory(tensor);
+  }
+  const auto element_size = getDataTypeSize(tensor.element_type());
+  const auto num_elements = tensor.shape().num_elements();
+
+  auto *data = new uint8_t[num_elements * element_size];
+  tensor.set_data_buffer(data);
+}
+
+void SimpleMemoryManager::release_memory(luci_interpreter::Tensor &tensor)
+{
+  if (!tensor.is_data_allocated())
+  {
+    tensor.set_data_buffer(nullptr);
+    return;
+  }
+  auto data = tensor.data<uint8_t>();
+  delete[] data;
+  tensor.set_data_buffer(nullptr);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/StaticMemoryManager.cpp b/compiler/luci-interpreter/src/StaticMemoryManager.cpp
new file mode 100644
index 000000000..73a819919
--- /dev/null
+++ b/compiler/luci-interpreter/src/StaticMemoryManager.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci_interpreter/StaticMemoryManager.h"
+
+namespace luci_interpreter
+{
+
+void StaticMemoryManager::allocate_memory(luci_interpreter::Tensor &tensor)
+{
+  if (!tensor.is_allocatable())
+  {
+    return;
+  }
+  int32_t offset = tensor.get_offset();
+  assert(offset >= 0);
+  auto tensor_ptr = _buffer_ptr + offset;
+  tensor.set_data_buffer(tensor_ptr);
+}
+
+void StaticMemoryManager::release_memory(luci_interpreter::Tensor &tensor)
+{
+  tensor.set_data_buffer(nullptr);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/TestMemoryManager.cpp b/compiler/luci-interpreter/src/TestMemoryManager.cpp
new file mode 100644
index 000000000..3beeee55c
--- /dev/null
+++ b/compiler/luci-interpreter/src/TestMemoryManager.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+
+void TestMemoryManager::allocate_memory(luci_interpreter::Tensor &tensor)
+{
+  if (!tensor.is_allocatable())
+  {
+    return;
+  }
+  if (tensor.is_data_allocated())
+  {
+    release_memory(tensor);
+  }
+  const auto element_size = getDataTypeSize(tensor.element_type());
+  const auto num_elements = tensor.shape().num_elements();
+
+  auto *data = new uint8_t[num_elements * element_size];
+  allocations.push_back(data);
+  tensor.set_data_buffer(data);
+}
+
+void TestMemoryManager::release_memory(luci_interpreter::Tensor &tensor)
+{
+  tensor.set_data_buffer(nullptr);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/core/CMakeLists.txt b/compiler/luci-interpreter/src/core/CMakeLists.txt
index e576dbd94..4430cba11 100644
--- a/compiler/luci-interpreter/src/core/CMakeLists.txt
+++ b/compiler/luci-interpreter/src/core/CMakeLists.txt
@@ -9,9 +9,9 @@ set(SOURCES
     RuntimeModule.h
     Tensor.cpp)
 
-add_library(luci_interpreter_core STATIC ${SOURCES})
-set_target_properties(luci_interpreter_core PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_include_directories(luci_interpreter_core PUBLIC "${LUCI_INTERPRETER_INCLUDE_DIR}")
-target_include_directories(luci_interpreter_core PUBLIC "${LUCI_INTERPRETER_SOURCE_DIR}")
-target_link_libraries(luci_interpreter_core PUBLIC luci_lang)
-target_link_libraries(luci_interpreter_core PRIVATE nncc_common)
+add_library(${LUCI_INTERPRETER_CORE} STATIC ${SOURCES})
+set_target_properties(${LUCI_INTERPRETER_CORE} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_include_directories(${LUCI_INTERPRETER_CORE} PUBLIC "${LUCI_INTERPRETER_INCLUDE_DIR}")
+target_include_directories(${LUCI_INTERPRETER_CORE} PUBLIC "${LUCI_INTERPRETER_SOURCE_DIR}")
+target_link_libraries(${LUCI_INTERPRETER_CORE} PUBLIC luci_lang)
+target_link_libraries(${LUCI_INTERPRETER_CORE} PRIVATE nncc_common)
diff --git a/compiler/luci-interpreter/src/core/Kernel.h b/compiler/luci-interpreter/src/core/Kernel.h
index 5cdb2e360..a7c4a4218 100644
--- a/compiler/luci-interpreter/src/core/Kernel.h
+++ b/compiler/luci-interpreter/src/core/Kernel.h
@@ -36,8 +36,8 @@ protected:
 public:
   virtual ~Kernel() = default;
 
-  std::vector<const Tensor *> getInputTensors() const { return _inputs; }
-  std::vector<Tensor *> getOutputTensors() const { return _outputs; }
+  const std::vector<const Tensor *> &getInputTensors() const { return _inputs; }
+  const std::vector<Tensor *> &getOutputTensors() const { return _outputs; }
 
   // Configures the kernel.
   // This function is currently called once for each kernel during interpreter construction,
diff --git a/compiler/luci-interpreter/src/core/RuntimeGraph.cpp b/compiler/luci-interpreter/src/core/RuntimeGraph.cpp
index fb0ad304b..c2f8d2ea8 100644
--- a/compiler/luci-interpreter/src/core/RuntimeGraph.cpp
+++ b/compiler/luci-interpreter/src/core/RuntimeGraph.cpp
@@ -29,8 +29,10 @@ class RuntimeGraph::TensorAllocPlan
   std::vector<std::vector<Tensor *>> _alloc_plan;
   std::vector<std::vector<Tensor *>> _dealloc_plan;
   bool _valid = false;
+  IMemoryManager *_memory_manager;
 
 public:
+  explicit TensorAllocPlan(IMemoryManager *memory_manager);
   void invalidate() { _valid = false; }
   bool isValid() const { return _valid; }
   void build(const RuntimeGraph &graph);
@@ -38,6 +40,11 @@ public:
   void deallocate(size_t kernel_index) const;
 };
 
+RuntimeGraph::TensorAllocPlan::TensorAllocPlan(IMemoryManager *memory_manager)
+  : _memory_manager(memory_manager)
+{
+}
+
 void RuntimeGraph::TensorAllocPlan::build(const RuntimeGraph &graph)
 {
   invalidate();
@@ -80,7 +87,7 @@ void RuntimeGraph::TensorAllocPlan::allocate(size_t kernel_index) const
   assert(_valid && kernel_index < _alloc_plan.size());
   for (Tensor *tensor : _alloc_plan[kernel_index])
   {
-    tensor->allocate();
+    _memory_manager->allocate_memory(*tensor);
   }
 }
 
@@ -89,16 +96,24 @@ void RuntimeGraph::TensorAllocPlan::deallocate(size_t kernel_index) const
   assert(_valid && kernel_index < _dealloc_plan.size());
   for (Tensor *tensor : _dealloc_plan[kernel_index])
   {
-    tensor->deallocate();
+    _memory_manager->release_memory(*tensor);
   }
 }
 
-RuntimeGraph::RuntimeGraph(RuntimeModule *owning_module)
-  : _owning_module(owning_module), _tensor_alloc_plan(std::make_unique<TensorAllocPlan>())
+RuntimeGraph::RuntimeGraph(RuntimeModule *owning_module, IMemoryManager *memory_manager)
+  : _owning_module(owning_module), _memory_manager(memory_manager),
+    _tensor_alloc_plan(std::make_unique<TensorAllocPlan>(memory_manager))
 {
 }
 
-RuntimeGraph::~RuntimeGraph() {}
+RuntimeGraph::~RuntimeGraph()
+{
+  for (auto &tensor : _tensors)
+  {
+    if (tensor->is_data_allocated())
+      _memory_manager->release_memory(*tensor);
+  }
+}
 
 Tensor *RuntimeGraph::addTensor(std::unique_ptr<Tensor> &&tensor)
 {
@@ -121,6 +136,11 @@ void RuntimeGraph::setOutputTensors(const std::vector<Tensor *> &output_tensors)
   _output_tensors = output_tensors;
 }
 
+void RuntimeGraph::configureAllocations(Tensor *tensor)
+{
+  _memory_manager->allocate_memory(*tensor);
+}
+
 void RuntimeGraph::addKernel(std::unique_ptr<Kernel> &&kernel)
 {
   assert(kernel != nullptr);
@@ -140,7 +160,8 @@ void RuntimeGraph::execute() const
   {
     for (const Tensor *input_tensor : getInputTensors())
     {
-      event_notifier->postTensorWrite(input_tensor);
+      if (input_tensor->is_observable())
+        event_notifier->postTensorWrite(input_tensor);
     }
   }
 
@@ -155,11 +176,10 @@ void RuntimeGraph::execute() const
     // TODO The `configure` method should only be called if the outputs of an operator need to be
     //  resized.
     kernel->configure();
-// TODO decide where to allocate memory, and uncomment/remove this if
-#if 0
-    _tensor_alloc_plan->allocate(
-        index); // Preallocate outputs in advance instead of relying on automatic allocation
-#endif
+
+    // Preallocate outputs in advance instead of relying on automatic allocation
+    _tensor_alloc_plan->allocate(index);
+
     kernel->execute();
 
     if (event_notifier != nullptr)
@@ -169,7 +189,7 @@ void RuntimeGraph::execute() const
 
     for (const Tensor *tensor : kernel->getOutputTensors())
     {
-      if (event_notifier != nullptr)
+      if (event_notifier != nullptr && tensor->is_observable())
       {
         event_notifier->postTensorWrite(tensor);
       }
diff --git a/compiler/luci-interpreter/src/core/RuntimeGraph.h b/compiler/luci-interpreter/src/core/RuntimeGraph.h
index 5f732025d..8184e249d 100644
--- a/compiler/luci-interpreter/src/core/RuntimeGraph.h
+++ b/compiler/luci-interpreter/src/core/RuntimeGraph.h
@@ -18,6 +18,7 @@
 #define LUCI_INTERPRETER_CORE_RUNTIMEGRAPH_H
 
 #include "luci_interpreter/core/Tensor.h"
+#include "luci_interpreter/MemoryManager.h"
 #include "core/Kernel.h"
 
 #include <memory>
@@ -35,7 +36,7 @@ private:
   friend class TensorAllocPlan;
 
 public:
-  explicit RuntimeGraph(RuntimeModule *owning_module);
+  explicit RuntimeGraph(RuntimeModule *owning_module, IMemoryManager *memory_manager);
   ~RuntimeGraph();
 
   Tensor *addTensor(std::unique_ptr<Tensor> &&tensor);
@@ -43,6 +44,8 @@ public:
   void setInputTensors(const std::vector<Tensor *> &input_tensors);
   void setOutputTensors(const std::vector<Tensor *> &output_tensors);
 
+  void configureAllocations(Tensor *tensor);
+
   const std::vector<Tensor *> &getInputTensors() const { return _input_tensors; }
   const std::vector<Tensor *> &getOutputTensors() const { return _output_tensors; }
 
@@ -51,6 +54,7 @@ public:
   void execute() const;
 
 private:
+  IMemoryManager *_memory_manager;
   RuntimeModule *_owning_module;
   std::vector<std::unique_ptr<Tensor>> _tensors;
   std::vector<Tensor *> _input_tensors;
diff --git a/compiler/luci-interpreter/src/core/RuntimeModule.h b/compiler/luci-interpreter/src/core/RuntimeModule.h
index dccc3a173..78873b0ec 100644
--- a/compiler/luci-interpreter/src/core/RuntimeModule.h
+++ b/compiler/luci-interpreter/src/core/RuntimeModule.h
@@ -19,6 +19,7 @@
 
 #include "core/RuntimeGraph.h"
 #include "core/EventNotifier.h"
+#include "luci_interpreter/MemoryManager.h"
 
 #include <memory>
 #include <vector>
@@ -33,9 +34,9 @@ public:
 
   EventNotifier *getEventNotifier() const { return _event_notifier; }
 
-  RuntimeGraph *addGraph()
+  RuntimeGraph *addGraph(IMemoryManager *memory_manager)
   {
-    _graphs.push_back(std::make_unique<RuntimeGraph>(this));
+    _graphs.push_back(std::make_unique<RuntimeGraph>(this, memory_manager));
     return _graphs.back().get();
   }
 
diff --git a/compiler/luci-interpreter/src/core/Tensor.cpp b/compiler/luci-interpreter/src/core/Tensor.cpp
index a9e7be0a9..3c3c5ffff 100644
--- a/compiler/luci-interpreter/src/core/Tensor.cpp
+++ b/compiler/luci-interpreter/src/core/Tensor.cpp
@@ -29,21 +29,6 @@ Tensor::Tensor(DataType element_type, Shape shape, AffineQuantization quantizati
 {
 }
 
-void Tensor::allocate()
-{
-  deallocate();
-  const size_t element_size = getDataTypeSize(_element_type);
-  const int32_t num_elements = _shape.num_elements();
-  _data = std::make_unique<uint8_t[]>(num_elements * element_size);
-  _data_allocated = true;
-}
-
-void Tensor::deallocate()
-{
-  _data_allocated = false;
-  _data.reset();
-}
-
 void Tensor::readData(void *data_ptr, size_t data_size) const
 {
   const size_t element_size = getDataTypeSize(element_type());
@@ -68,10 +53,6 @@ void Tensor::writeData(const void *data_ptr, size_t data_size)
   std::memcpy(data<void>(), data_ptr, data_size);
 }
 
-void Tensor::resize(const Shape &new_shape)
-{
-  deallocate();
-  _shape = new_shape;
-}
+void Tensor::resize(const Shape &new_shape) { _shape = new_shape; }
 
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Add.test.cpp b/compiler/luci-interpreter/src/kernels/Add.test.cpp
index 5ad9beb30..847b65667 100644
--- a/compiler/luci-interpreter/src/kernels/Add.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Add.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Add.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,6 +28,14 @@ namespace
 
 using namespace testing;
 
+class AddTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
 // for quantized Add, the error shouldn't exceed step
 float GetTolerance(float min, float max)
 {
@@ -34,7 +43,7 @@ float GetTolerance(float min, float max)
   return kQuantizedStep;
 }
 
-TEST(AddTest, Uint8)
+TEST_F(AddTest, Uint8)
 {
   std::initializer_list<int32_t> base_shape = {2, 3, 1, 2};
   std::initializer_list<float> base_data = {-0.3f, 2.3f, 0.9f,  0.5f, 0.8f, -1.1f,
@@ -57,10 +66,10 @@ TEST(AddTest, Uint8)
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-3.f, 3.f);
   for (int i = 0; i < output_data.size(); i++)
   {
-    Tensor input1_tensor =
-      makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
-    Tensor input2_tensor = makeInputTensor<DataType::U8>(test_shapes[i], quant_param.first,
-                                                         quant_param.second, test_data);
+    Tensor input1_tensor = makeInputTensor<DataType::U8>(
+      base_shape, quant_param.first, quant_param.second, base_data, _memory_manager.get());
+    Tensor input2_tensor = makeInputTensor<DataType::U8>(
+      test_shapes[i], quant_param.first, quant_param.second, test_data, _memory_manager.get());
     Tensor output_tensor =
       makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
 
@@ -69,6 +78,7 @@ TEST(AddTest, Uint8)
 
     Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(dequantizeTensorData(output_tensor),
@@ -78,10 +88,10 @@ TEST(AddTest, Uint8)
   // Re-run with exchanged inputs.
   for (int i = 0; i < output_data.size(); i++)
   {
-    Tensor input1_tensor = makeInputTensor<DataType::U8>(test_shapes[i], quant_param.first,
-                                                         quant_param.second, test_data);
-    Tensor input2_tensor =
-      makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
+    Tensor input1_tensor = makeInputTensor<DataType::U8>(
+      test_shapes[i], quant_param.first, quant_param.second, test_data, _memory_manager.get());
+    Tensor input2_tensor = makeInputTensor<DataType::U8>(
+      base_shape, quant_param.first, quant_param.second, base_data, _memory_manager.get());
     Tensor output_tensor =
       makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
 
@@ -90,6 +100,7 @@ TEST(AddTest, Uint8)
 
     Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(dequantizeTensorData(output_tensor),
@@ -98,7 +109,7 @@ TEST(AddTest, Uint8)
   }
 }
 
-TEST(AddTest, Float)
+TEST_F(AddTest, Float)
 {
   Shape base_shape = {2, 3, 1, 2};
   std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
@@ -116,8 +127,10 @@ TEST(AddTest, Float)
   std::vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
   for (size_t i = 0; i < test_shapes.size(); ++i)
   {
-    Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(base_shape, input1_data);
-    Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(test_shapes[i], input2_data);
+    Tensor input1_tensor =
+      makeInputTensor<DataType::FLOAT32>(base_shape, input1_data, _memory_manager.get());
+    Tensor input2_tensor =
+      makeInputTensor<DataType::FLOAT32>(test_shapes[i], input2_data, _memory_manager.get());
     Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
     AddParams params{};
@@ -125,6 +138,7 @@ TEST(AddTest, Float)
 
     Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
@@ -133,8 +147,10 @@ TEST(AddTest, Float)
   // Re-run with exchanged inputs.
   for (size_t i = 0; i < test_shapes.size(); ++i)
   {
-    Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(test_shapes[i], input2_data);
-    Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(base_shape, input1_data);
+    Tensor input1_tensor =
+      makeInputTensor<DataType::FLOAT32>(test_shapes[i], input2_data, _memory_manager.get());
+    Tensor input2_tensor =
+      makeInputTensor<DataType::FLOAT32>(base_shape, input1_data, _memory_manager.get());
     Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
     AddParams params{};
@@ -142,6 +158,7 @@ TEST(AddTest, Float)
 
     Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
@@ -149,7 +166,7 @@ TEST(AddTest, Float)
   }
 }
 
-TEST(AddTest, SInt16)
+TEST_F(AddTest, SInt16)
 {
   Shape base_shape = {2, 3, 1, 2};
   std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
@@ -171,9 +188,10 @@ TEST(AddTest, SInt16)
 
   for (size_t i = 0; i < test_shapes.size(); ++i)
   {
-    Tensor input1_tensor = makeInputTensor<DataType::S16>(base_shape, 3.0 / 32767, 0, input1_data);
-    Tensor input2_tensor =
-      makeInputTensor<DataType::S16>(test_shapes[i], 1.0 / 32767, 0, input2_data);
+    Tensor input1_tensor = makeInputTensor<DataType::S16>(base_shape, 3.0 / 32767, 0, input1_data,
+                                                          _memory_manager.get());
+    Tensor input2_tensor = makeInputTensor<DataType::S16>(test_shapes[i], 1.0 / 32767, 0,
+                                                          input2_data, _memory_manager.get());
     Tensor output_tensor = makeOutputTensor(DataType::S16, 4.0 / 32767, 0);
     const float tolerance = output_tensor.scale();
 
@@ -182,6 +200,7 @@ TEST(AddTest, SInt16)
 
     Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(extractTensorShape(output_tensor),
@@ -193,9 +212,10 @@ TEST(AddTest, SInt16)
   // Re-run with exchanged inputs and different scales.
   for (size_t i = 0; i < test_shapes.size(); ++i)
   {
-    Tensor input1_tensor =
-      makeInputTensor<DataType::S16>(test_shapes[i], 2.0 / 32767, 0, input2_data);
-    Tensor input2_tensor = makeInputTensor<DataType::S16>(base_shape, 4.0 / 32767, 0, input1_data);
+    Tensor input1_tensor = makeInputTensor<DataType::S16>(test_shapes[i], 2.0 / 32767, 0,
+                                                          input2_data, _memory_manager.get());
+    Tensor input2_tensor = makeInputTensor<DataType::S16>(base_shape, 4.0 / 32767, 0, input1_data,
+                                                          _memory_manager.get());
     Tensor output_tensor = makeOutputTensor(DataType::S16, 5.0 / 32767, 0);
     const float tolerance = output_tensor.scale();
 
@@ -204,6 +224,7 @@ TEST(AddTest, SInt16)
 
     Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(extractTensorShape(output_tensor),
@@ -214,10 +235,10 @@ TEST(AddTest, SInt16)
   }
 }
 
-TEST(AddTest, Input_Output_Type_NEG)
+TEST_F(AddTest, Input_Output_Type_NEG)
 {
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {2});
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {2}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   AddParams params{};
@@ -227,10 +248,10 @@ TEST(AddTest, Input_Output_Type_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(AddTest, Invalid_Input_Type_NEG)
+TEST_F(AddTest, Invalid_Input_Type_NEG)
 {
-  Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1});
-  Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2});
+  Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S64);
 
   AddParams params{};
@@ -238,6 +259,7 @@ TEST(AddTest, Invalid_Input_Type_NEG)
 
   Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   EXPECT_ANY_THROW(kernel.execute());
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/ArgMax.cpp b/compiler/luci-interpreter/src/kernels/ArgMax.cpp
index 2437d5762..6561a1783 100644
--- a/compiler/luci-interpreter/src/kernels/ArgMax.cpp
+++ b/compiler/luci-interpreter/src/kernels/ArgMax.cpp
@@ -16,7 +16,7 @@
 
 #include "kernels/ArgMax.h"
 #include "kernels/Utils.h"
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALArgMax.h"
 
 namespace luci_interpreter
 {
@@ -60,10 +60,10 @@ void ArgMax::configure()
 void ArgMax::execute() const
 {
 
-#define TF_LITE_ARG_MAX(data_type, axis_type, output_type)                                        \
-  tflite::optimized_ops::ArgMinMax(                                                               \
-    getTensorShape(input()), getTensorData<data_type>(input()), getTensorData<axis_type>(axis()), \
-    getTensorShape(output()), getTensorData<output_type>(output()), std::greater<data_type>())
+#define TF_LITE_ARG_MAX(data_type, axis_type, output_type)                                    \
+  luci_interpreter_pal::ArgMinMax(getTensorShape(input()), getTensorData<data_type>(input()), \
+                                  getTensorData<axis_type>(axis()), getTensorShape(output()), \
+                                  getTensorData<output_type>(output()), std::greater<data_type>())
   if (axis()->element_type() == DataType::S32)
   {
     switch (_params.output_type)
diff --git a/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp b/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp
index 3362edbf6..119c69ccf 100644
--- a/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/ArgMax.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -32,15 +33,19 @@ void Check(std::initializer_list<int32_t> input_shape,
            std::initializer_list<int32_t> output_shape, std::initializer_list<T1> input_data,
            std::initializer_list<int32_t> dimension_data, std::initializer_list<T2> output_data)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   constexpr DataType element_type = getElementType<T1>();
-  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
-  Tensor dimension_tensor = makeInputTensor<DataType::S32>(dimension_shape, dimension_data);
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
+  Tensor dimension_tensor =
+    makeInputTensor<DataType::S32>(dimension_shape, dimension_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(getElementType<T2>());
 
   ArgMaxParams params{};
   params.output_type = getElementType<T2>();
   ArgMax kernel(&input_tensor, &dimension_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<T2>(output_tensor), ::testing::ElementsAreArray(output_data));
@@ -94,17 +99,21 @@ TYPED_TEST(ArgMaxTest, MultiDimensions)
 
 TEST(ArgMaxTest, UnsupportedType_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 1, 2, 4}, {
-                                                                           1, 2, 7, 8, //
-                                                                           1, 9, 7, 3, //
-                                                                         });
-  Tensor dimension_tensor = makeInputTensor<DataType::S32>({}, {3});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 1, 2, 4},
+                                                           {
+                                                             1, 2, 7, 8, //
+                                                             1, 9, 7, 3, //
+                                                           },
+                                                           memory_manager.get());
+  Tensor dimension_tensor = makeInputTensor<DataType::S32>({}, {3}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   ArgMaxParams params{};
   params.output_type = DataType::U8;
   ArgMax kernel(&input_tensor, &dimension_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   EXPECT_ANY_THROW(kernel.execute());
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp b/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp
index 65ea4c09e..5545fb4d4 100644
--- a/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp
+++ b/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp
@@ -70,6 +70,11 @@ void AveragePool2D::configure()
     LUCI_INTERPRETER_CHECK(std::abs(output()->scale() - input()->scale()) <= 1.0e-6);
     LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && output()->zero_point() == 0);
   }
+  else if (input()->element_type() == DataType::S8)
+  {
+    LUCI_INTERPRETER_CHECK(std::abs(output()->scale() - input()->scale()) <= 1.0e-6);
+    LUCI_INTERPRETER_CHECK(output()->zero_point() == input()->zero_point());
+  }
   output()->resize({batches, output_height, output_width, depth});
 }
 
@@ -86,6 +91,9 @@ void AveragePool2D::execute() const
     case DataType::S16:
       evalSInt16();
       break;
+    case DataType::S8:
+      evalSInt8();
+      break;
     default:
       throw std::runtime_error("Unsupported type.");
   }
@@ -132,6 +140,26 @@ void AveragePool2D::evalQuantized() const
                                      getTensorData<uint8_t>(output()));
 }
 
+void AveragePool2D::evalSInt8() const
+{
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+  tflite::PoolParams params{};
+  params.padding_values.height = _padding_height;
+  params.padding_values.width = _padding_width;
+  params.stride_height = _params.stride_height;
+  params.stride_width = _params.stride_width;
+  params.filter_height = _params.filter_height;
+  params.filter_width = _params.filter_width;
+  params.quantized_activation_min = activation_min;
+  params.quantized_activation_max = activation_max;
+
+  tflite::reference_integer_ops::AveragePool(
+    params, getTensorShape(input()), getTensorData<int8_t>(input()), getTensorShape(output()),
+    getTensorData<int8_t>(output()));
+}
+
 void AveragePool2D::evalSInt16() const
 {
   int32_t activation_min{};
diff --git a/compiler/luci-interpreter/src/kernels/AveragePool2D.h b/compiler/luci-interpreter/src/kernels/AveragePool2D.h
index 282a58797..b98367f31 100644
--- a/compiler/luci-interpreter/src/kernels/AveragePool2D.h
+++ b/compiler/luci-interpreter/src/kernels/AveragePool2D.h
@@ -40,6 +40,7 @@ private:
   void evalFloat() const;
   void evalQuantized() const;
   void evalSInt16() const;
+  void evalSInt8() const;
 
 private:
   int32_t _padding_height{};
diff --git a/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp b/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp
index 4d7dab86a..7ed421129 100644
--- a/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/AveragePool2D.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -26,7 +27,15 @@ namespace
 
 using namespace testing;
 
-TEST(AveragePool2DTest, Float)
+class AveragePool2DTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(AveragePool2DTest, Float)
 {
   Shape input_shape{1, 3, 5, 1};
   std::vector<float> input_data{
@@ -34,7 +43,8 @@ TEST(AveragePool2DTest, Float)
     1,  2,  3,  4,  5,  //
     6,  7,  8,  9,  10, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pool2DParams params{};
@@ -47,6 +57,7 @@ TEST(AveragePool2DTest, Float)
 
   AveragePool2D kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{
@@ -57,15 +68,15 @@ TEST(AveragePool2DTest, Float)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2, 1}));
 }
 
-TEST(AveragePool2DTest, Uint8_0)
+TEST_F(AveragePool2DTest, Uint8_0)
 {
   std::vector<float> input_data{
     0,  -6, 12, 4, //
     -3, -2, 10, 7, //
   };
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-15.9375f, 15.9375f);
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, input_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
 
   Pool2DParams params{};
@@ -78,13 +89,14 @@ TEST(AveragePool2DTest, Uint8_0)
 
   AveragePool2D kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear({0.0, 6.0}));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 1}));
 }
 
-TEST(AveragePool2DTest, Uint8_1)
+TEST_F(AveragePool2DTest, Uint8_1)
 {
   std::vector<float> input_data{
     0, 6, 12, 4, //
@@ -92,8 +104,8 @@ TEST(AveragePool2DTest, Uint8_1)
   };
 
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-15.9375f, 15.9375f);
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, input_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
 
   Pool2DParams params{};
@@ -106,13 +118,14 @@ TEST(AveragePool2DTest, Uint8_1)
 
   AveragePool2D kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear({2.75, 6.0}));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 1}));
 }
 
-TEST(AveragePool2DTest, SInt16)
+TEST_F(AveragePool2DTest, SInt16)
 {
   Shape input_shape{1, 3, 5, 1};
   std::vector<int32_t> ref_output_shape{1, 2, 2, 1};
@@ -125,7 +138,8 @@ TEST(AveragePool2DTest, SInt16)
     0, 1.5, //
     4.5, 6, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, 0.5, 0, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>(input_shape, 0.5, 0, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S16, 0.5, 0);
 
   Pool2DParams params{};
@@ -138,13 +152,47 @@ TEST(AveragePool2DTest, SInt16)
 
   AveragePool2D kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
   EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
 }
 
-TEST(AveragePool2DTest, Invalid_Input_Shape_NEG)
+TEST_F(AveragePool2DTest, SInt8)
+{
+  Shape input_shape{1, 4, 5, 1};
+  std::vector<int32_t> ref_output_shape{1, 2, 2, 1};
+  std::vector<float> input_data{-7, -3, 0,  2, -5, 12, -15, 3,  10, 5,
+                                7,  -6, -1, 9, -2, 0,  -5,  11, -1, -7};
+  std::vector<float> ref_output_data{
+    0, 2.5, //
+    1, 1.5, //
+  };
+
+  std::pair<float, int32_t> quant_param = quantizationParams<int8_t>(-15.9375f, 15.9375f);
+  Tensor input_tensor = makeInputTensor<DataType::S8>(
+    input_shape, quant_param.first, quant_param.second, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::S8, quant_param.first, quant_param.second);
+
+  Pool2DParams params{};
+  params.padding = Padding::VALID;
+  params.filter_height = 2;
+  params.filter_width = 3;
+  params.stride_height = 2;
+  params.stride_width = 2;
+  params.activation = Activation::RELU6;
+
+  AveragePool2D kernel(&input_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST_F(AveragePool2DTest, Invalid_Input_Shape_NEG)
 {
   Shape input_shape{1, 3, 5};
   std::vector<float> input_data{
@@ -152,7 +200,8 @@ TEST(AveragePool2DTest, Invalid_Input_Shape_NEG)
     1,  2,  3,  4,  5,  //
     6,  7,  8,  9,  10, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pool2DParams params{};
@@ -167,7 +216,7 @@ TEST(AveragePool2DTest, Invalid_Input_Shape_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(AveragePool2DTest, In_Out_Type_NEG)
+TEST_F(AveragePool2DTest, In_Out_Type_NEG)
 {
   Shape input_shape{1, 3, 5, 1};
   std::vector<float> input_data{
@@ -175,7 +224,8 @@ TEST(AveragePool2DTest, In_Out_Type_NEG)
     1,  2,  3,  4,  5,  //
     6,  7,  8,  9,  10, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   Pool2DParams params{};
@@ -190,7 +240,7 @@ TEST(AveragePool2DTest, In_Out_Type_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(AveragePool2DTest, Quant_Param_NEG)
+TEST_F(AveragePool2DTest, Quant_Param_NEG)
 {
   std::vector<float> input_data{
     0,  -6, 12, 4, //
@@ -199,8 +249,8 @@ TEST(AveragePool2DTest, Quant_Param_NEG)
 
   std::pair<float, int32_t> quant_param1 = quantizationParams<uint8_t>(-15.9375f, 15.9375f);
   std::pair<float, int32_t> quant_param2 = quantizationParams<uint8_t>(-7.875f, 7.875f);
-  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param1.first,
-                                                      quant_param1.second, input_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param1.first, quant_param1.second, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param2.first, quant_param2.second);
 
   Pool2DParams params{};
diff --git a/compiler/luci-interpreter/src/kernels/BatchToSpaceND.cpp b/compiler/luci-interpreter/src/kernels/BatchToSpaceND.cpp
index 591fcc00a..bd315ff7b 100644
--- a/compiler/luci-interpreter/src/kernels/BatchToSpaceND.cpp
+++ b/compiler/luci-interpreter/src/kernels/BatchToSpaceND.cpp
@@ -18,7 +18,7 @@
 #include "kernels/BatchToSpaceND.h"
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALBatchToSpaceND.h"
 
 #include <stdexcept>
 
@@ -83,13 +83,13 @@ void BatchToSpaceND::execute() const
   switch (input()->element_type())
   {
     case DataType::FLOAT32:
-      tflite::optimized_ops::BatchToSpaceND(
+      luci_interpreter_pal::BatchToSpaceND(
         getTensorShape(input()), getTensorData<float>(input()), getTensorShape(block_shape()),
         getTensorData<int32_t>(block_shape()), getTensorShape(crops()),
         getTensorData<int32_t>(crops()), getTensorShape(output()), getTensorData<float>(output()));
       break;
     case DataType::U8:
-      tflite::optimized_ops::BatchToSpaceND(
+      luci_interpreter_pal::BatchToSpaceND(
         getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(block_shape()),
         getTensorData<int32_t>(block_shape()), getTensorShape(crops()),
         getTensorData<int32_t>(crops()), getTensorShape(output()),
diff --git a/compiler/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp b/compiler/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp
index a29981d17..f3a344974 100644
--- a/compiler/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/BatchToSpaceND.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -33,14 +34,19 @@ void Check(std::initializer_list<int32_t> input_shape,
            std::initializer_list<T> input_data, std::initializer_list<int32_t> block_shape_data,
            std::initializer_list<int32_t> crops_data, std::initializer_list<T> output_data)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   constexpr DataType element_type = getElementType<T>();
-  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
-  Tensor block_shape_tensor = makeInputTensor<DataType::S32>(block_shape_shape, block_shape_data);
-  Tensor crops_tensor = makeInputTensor<DataType::S32>(crops_shape, crops_data);
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
+  Tensor block_shape_tensor =
+    makeInputTensor<DataType::S32>(block_shape_shape, block_shape_data, memory_manager.get());
+  Tensor crops_tensor =
+    makeInputTensor<DataType::S32>(crops_shape, crops_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(element_type);
 
   BatchToSpaceND kernel(&input_tensor, &block_shape_tensor, &crops_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
@@ -65,10 +71,11 @@ TYPED_TEST(BatchToSpaceNDTest, Simple)
 
 TEST(BatchToSpaceNDTest, Invalid_Shape_NEG)
 {
-  Tensor input_tensor =
-    makeInputTensor<DataType::FLOAT32>({3, 2, 2, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
-  Tensor block_shape_tensor = makeInputTensor<DataType::S32>({2}, {2, 2});
-  Tensor crops_tensor = makeInputTensor<DataType::S32>({2, 2}, {0, 0, 0, 0});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(
+    {3, 2, 2, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, memory_manager.get());
+  Tensor block_shape_tensor = makeInputTensor<DataType::S32>({2}, {2, 2}, memory_manager.get());
+  Tensor crops_tensor = makeInputTensor<DataType::S32>({2, 2}, {0, 0, 0, 0}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   BatchToSpaceND kernel(&input_tensor, &block_shape_tensor, &crops_tensor, &output_tensor);
@@ -77,10 +84,11 @@ TEST(BatchToSpaceNDTest, Invalid_Shape_NEG)
 
 TEST(BatchToSpaceNDTest, Invalid_Crops_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(
-    {4, 2, 2, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
-  Tensor block_shape_tensor = makeInputTensor<DataType::S32>({2}, {2, 2});
-  Tensor crops_tensor = makeInputTensor<DataType::S32>({2, 2}, {0, 0, -1, 0});
+    {4, 2, 2, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, memory_manager.get());
+  Tensor block_shape_tensor = makeInputTensor<DataType::S32>({2}, {2, 2}, memory_manager.get());
+  Tensor crops_tensor = makeInputTensor<DataType::S32>({2, 2}, {0, 0, -1, 0}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   BatchToSpaceND kernel(&input_tensor, &block_shape_tensor, &crops_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/CMakeLists.txt b/compiler/luci-interpreter/src/kernels/CMakeLists.txt
index 9801e11af..1b7d0f66a 100644
--- a/compiler/luci-interpreter/src/kernels/CMakeLists.txt
+++ b/compiler/luci-interpreter/src/kernels/CMakeLists.txt
@@ -1,158 +1,27 @@
-find_package(Threads REQUIRED)
-
 set(SOURCES
-    Add.h
-    Add.cpp
-    ArgMax.h
-    ArgMax.cpp
-    AveragePool2D.h
-    AveragePool2D.cpp
-    BatchToSpaceND.h
-    BatchToSpaceND.cpp
-    Cast.h
-    Cast.cpp
-    Concatenation.h
-    Concatenation.cpp
-    Conv2D.h
-    Conv2D.cpp
-    DepthToSpace.h
-    DepthToSpace.cpp
-    DepthwiseConv2D.h
-    DepthwiseConv2D.cpp
-    Div.h
-    Div.cpp
-    Elu.h
-    Elu.cpp
-    Exp.h
-    Exp.cpp
-    Floor.h
-    Floor.cpp
-    FloorDiv.h
-    FloorDiv.cpp
-    Equal.h
-    Equal.cpp
-    FullyConnected.h
-    FullyConnected.cpp
-    Greater.h
-    Greater.cpp
-    GreaterEqual.h
-    GreaterEqual.cpp
-    If.h
-    If.cpp
-    InstanceNorm.h
-    InstanceNorm.cpp
-    L2Normalize.h
-    L2Normalize.cpp
-    L2Pool2D.h
-    L2Pool2D.cpp
-    LeakyRelu.h
-    LeakyRelu.cpp
-    Less.h
-    Less.cpp
-    LessEqual.h
-    LessEqual.cpp
-    LocalResponseNormalization.h
-    LocalResponseNormalization.cpp
-    LogicalAnd.h
-    LogicalAnd.cpp
-    LogicalNot.h
-    LogicalNot.cpp
-    LogicalOr.h
-    LogicalOr.cpp
-    Logistic.h
-    Logistic.cpp
-    LogSoftmax.h
-    LogSoftmax.cpp
-    Maximum.h
-    Maximum.cpp
-    MaxPool2D.h
-    MaxPool2D.cpp
-    Mean.h
-    Mean.cpp
-    Minimum.h
-    Minimum.cpp
-    MirrorPad.h
-    MirrorPad.cpp
-    Mul.h
-    Mul.cpp
-    Neg.h
-    Neg.cpp
-    NotEqual.h
-    NotEqual.cpp
-    Pack.h
-    Pack.cpp
-    Pad.h
-    Pad.cpp
-    PadV2.h
-    PadV2.cpp
-    Pow.h
-    Pow.cpp
-    PRelu.h
-    PRelu.cpp
-    Relu.h
-    Relu.cpp
-    Relu6.h
-    Relu6.cpp
-    Reshape.h
-    Reshape.cpp
-    ResizeBilinear.h
-    ResizeBilinear.cpp
-    ResizeNearestNeighbor.h
-    ResizeNearestNeighbor.cpp
-    ReverseV2.h
-    ReverseV2.cpp
-    Rsqrt.h
-    Rsqrt.cpp
-    Slice.h
-    Slice.cpp
-    Softmax.h
-    Softmax.cpp
-    SpaceToBatchND.h
-    SpaceToBatchND.cpp
-    SpaceToDepth.h
-    SpaceToDepth.cpp
-    Split.h
-    Split.cpp
-    StridedSlice.h
-    StridedSlice.cpp
-    Sqrt.h
-    Sqrt.cpp
-    Square.h
-    Square.cpp
-    SquaredDifference.h
-    SquaredDifference.cpp
-    Squeeze.h
-    Squeeze.cpp
-    Sub.h
-    Sub.cpp
-    Tanh.h
-    Tanh.cpp
-    Transpose.h
-    Transpose.cpp
-    TransposeConv.h
-    TransposeConv.cpp
-    Unpack.h
-    Unpack.cpp
-    While.h
-    While.cpp)
+        BinaryOpCommon.h
+        Utils.h
+        Utils.cpp
+        "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/TestMemoryManager.h"
+        ${LUCI_INTERPRETER_SOURCE_DIR}/TestMemoryManager.cpp
+        "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/SimpleMemoryManager.h"
+        ${LUCI_INTERPRETER_SOURCE_DIR}/SimpleMemoryManager.cpp)
+
+macro(REGISTER_KERNEL NODE)
+  list(APPEND SOURCES "${NODE}.h")
+  list(APPEND SOURCES "${NODE}.cpp")
+endmacro(REGISTER_KERNEL)
+
+include(${KERNEL_REGISTER_FILE})
 
-list(APPEND SOURCES
-    BinaryOpCommon.h
-    Utils.h
-    Utils.cpp
-    ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc)
+add_library(${LUCI_INTERPRETER_KERNELS} STATIC ${SOURCES})
+set_target_properties(${LUCI_INTERPRETER_KERNELS} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_include_directories(${LUCI_INTERPRETER_KERNELS} PUBLIC ${LUCI_INTERPRETER_SOURCE_DIR})
 
-add_library(luci_interpreter_kernels STATIC ${SOURCES})
-set_target_properties(luci_interpreter_kernels PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_include_directories(luci_interpreter_kernels PUBLIC ${LUCI_INTERPRETER_SOURCE_DIR})
-target_include_directories(luci_interpreter_kernels SYSTEM PRIVATE
-    "${TensorFlowRuySource_DIR}"
-    "${TensorFlowGEMMLowpSource_DIR}"
-    "${TensorFlowEigenSource_DIR}"
-    "${TensorFlowSource_DIR}")
-target_link_libraries(luci_interpreter_kernels
-    PUBLIC luci_interpreter_core
-    PRIVATE nncc_common Threads::Threads)
+target_link_libraries(${LUCI_INTERPRETER_KERNELS} PUBLIC ${LUCI_INTERPRETER_CORE})
+target_link_libraries(${LUCI_INTERPRETER_KERNELS} PRIVATE nncc_common)
+
+add_pal_to_target(${LUCI_INTERPRETER_KERNELS})
 
 if(NOT ENABLE_TEST)
   return()
@@ -160,75 +29,13 @@ endif(NOT ENABLE_TEST)
 
 nnas_find_package(GTest REQUIRED)
 
-set(TEST_SOURCES
-    Add.test.cpp
-    ArgMax.test.cpp
-    AveragePool2D.test.cpp
-    BatchToSpaceND.test.cpp
-    Cast.test.cpp
-    Concatenation.test.cpp
-    Conv2D.test.cpp
-    DepthToSpace.test.cpp
-    DepthwiseConv2D.test.cpp
-    Div.test.cpp
-    Elu.test.cpp
-    Exp.test.cpp
-    Floor.test.cpp
-    FloorDiv.test.cpp
-    Equal.test.cpp
-    FullyConnected.test.cpp
-    Greater.test.cpp
-    GreaterEqual.test.cpp
-    If.test.cpp
-    InstanceNorm.test.cpp
-    L2Normalize.test.cpp
-    L2Pool2D.test.cpp
-    LeakyRelu.test.cpp
-    Less.test.cpp
-    LessEqual.test.cpp
-    LocalResponseNormalization.test.cpp
-    LogicalAnd.test.cpp
-    LogicalNot.test.cpp
-    LogicalOr.test.cpp
-    Logistic.test.cpp
-    LogSoftmax.test.cpp
-    Maximum.test.cpp
-    MaxPool2D.test.cpp
-    Mean.test.cpp
-    Minimum.test.cpp
-    Mul.test.cpp
-    Neg.test.cpp
-    NotEqual.test.cpp
-    Pack.test.cpp
-    Pad.test.cpp
-    PadV2.test.cpp
-    Pow.test.cpp
-    PRelu.test.cpp
-    Relu.test.cpp
-    Relu6.test.cpp
-    Reshape.test.cpp
-    ResizeBilinear.test.cpp
-    ResizeNearestNeighbor.test.cpp
-    ReverseV2.test.cpp
-    Rsqrt.test.cpp
-    Slice.test.cpp
-    Softmax.test.cpp
-    SpaceToBatchND.test.cpp
-    SpaceToDepth.test.cpp
-    Split.test.cpp
-    StridedSlice.test.cpp
-    Sqrt.test.cpp
-    Square.test.cpp
-    SquaredDifference.test.cpp
-    Squeeze.test.cpp
-    Sub.test.cpp
-    Tanh.test.cpp
-    Transpose.test.cpp
-    TransposeConv.test.cpp
-    Unpack.test.cpp
-    While.test.cpp)
+macro(REGISTER_KERNEL NODE)
+  list(APPEND TEST_SOURCES "${NODE}.test.cpp")
+endmacro(REGISTER_KERNEL)
+
+include(${KERNEL_REGISTER_FILE})
 
 list(APPEND TEST_SOURCES TestUtils.h TestUtils.cpp)
 
-GTest_AddTest(luci_interpreter_kernels_test ${TEST_SOURCES})
-target_link_libraries(luci_interpreter_kernels_test luci_interpreter_kernels)
+GTest_AddTest(${LUCI_INTERPRETER_KERNELS}_test ${TEST_SOURCES})
+target_link_libraries(${LUCI_INTERPRETER_KERNELS}_test ${LUCI_INTERPRETER_KERNELS})
diff --git a/compiler/luci-interpreter/src/kernels/Cast.test.cpp b/compiler/luci-interpreter/src/kernels/Cast.test.cpp
index 42944628d..731260522 100644
--- a/compiler/luci-interpreter/src/kernels/Cast.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Cast.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Cast.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -30,59 +31,209 @@ template <typename T1, typename T2>
 void Check(std::initializer_list<int32_t> shape, std::initializer_list<T1> input_data,
            std::initializer_list<T2> output_data)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   constexpr DataType input_type = getElementType<T1>();
   constexpr DataType output_type = getElementType<T2>();
 
-  Tensor input_tensor = makeInputTensor<input_type>(shape, input_data);
+  Tensor input_tensor = makeInputTensor<input_type>(shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(output_type);
 
   Cast kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<T2>(output_tensor), ::testing::ElementsAreArray(output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), shape);
 }
 
+template <typename T>
+void CheckBoolTo(std::initializer_list<int32_t> shape, std::initializer_list<bool> input_data,
+                 std::initializer_list<T> output_data)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  constexpr DataType input_type = loco::DataType::BOOL;
+  constexpr DataType output_type = getElementType<T>();
+  std::vector<typename DataTypeImpl<input_type>::Type> input_data_converted;
+  for (auto elem : input_data)
+  {
+    input_data_converted.push_back(elem);
+  }
+
+  Tensor input_tensor =
+    makeInputTensor<input_type>(shape, input_data_converted, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(output_type);
+
+  Cast kernel(&input_tensor, &output_tensor);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), shape);
+}
+
 template <typename T> class CastTest : public ::testing::Test
 {
 };
 
-using DataTypes = ::testing::Types<uint8_t, int32_t, int64_t>;
-TYPED_TEST_CASE(CastTest, DataTypes);
+using IntDataTypes =
+  ::testing::Types<uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t>;
+TYPED_TEST_CASE(CastTest, IntDataTypes);
 
 TYPED_TEST(CastTest, FloatToInt)
 {
   Check<float, TypeParam>(/*shape=*/{1, 1, 1, 4},
                           /*input_data=*/
                           {
-                            1.43f, 9.99f, 7.0f, 3.12f, //
+                            1.0f, 9.0f, 7.0f, 3.0f, //
                           },
                           /*output_data=*/
                           {
                             1, 9, 7, 3, //
                           });
-  Check<TypeParam, TypeParam>(/*shape=*/{1, 1, 1, 4},
-                              /*input_data=*/
-                              {
-                                1, 9, 7, 3, //
-                              },
-                              /*output_data=*/
-                              {
-                                1, 9, 7, 3, //
-                              });
+  SUCCEED();
+}
+
+TYPED_TEST(CastTest, IntToFloat)
+{
+  Check<TypeParam, float>(/*shape=*/{1, 1, 1, 4},
+                          /*input_data=*/
+                          {
+                            1, 9, 7, 3, //
+                          },
+                          /*output_data=*/
+                          {
+                            1.0f, 9.0f, 7.0f, 3.0f, //
+                          });
+  SUCCEED();
+}
+
+template <typename T1, typename T2> void check_int()
+{
+  Check<T1, T2>(/*shape=*/{1, 1, 1, 4},
+                /*input_data=*/
+                {
+                  1, 9, 7, 3, //
+                },
+                /*output_data=*/
+                {
+                  1, 9, 7, 3, //
+                });
+  SUCCEED();
+}
+
+TYPED_TEST(CastTest, IntToInt)
+{
+  check_int<TypeParam, uint8_t>();
+  check_int<TypeParam, uint16_t>();
+  check_int<TypeParam, uint32_t>();
+  check_int<TypeParam, uint64_t>();
+  check_int<TypeParam, int8_t>();
+  check_int<TypeParam, int16_t>();
+  check_int<TypeParam, int32_t>();
+  check_int<TypeParam, int64_t>();
+  SUCCEED();
+}
+
+TYPED_TEST(CastTest, IntToBool)
+{
+  Check<TypeParam, bool>(/*shape=*/{1, 1, 1, 4},
+                         /*input_data=*/
+                         {
+                           1, 0, 7, 0, //
+                         },
+                         /*output_data=*/
+                         {
+                           true, false, true, false, //
+                         });
+  SUCCEED();
+}
+
+TYPED_TEST(CastTest, BoolToInt)
+{
+  CheckBoolTo<TypeParam>(/*shape=*/{1, 1, 1, 4},
+                         /*input_data=*/
+                         {
+                           true, false, false, true, //
+                         },
+                         /*output_data=*/
+                         {
+                           1, 0, 0, 1, //
+                         });
+  SUCCEED();
+}
+
+TEST(CastTest, FloatToBool)
+{
+  Check<float, bool>(/*shape=*/{1, 1, 1, 4},
+                     /*input_data=*/
+                     {
+                       1.0f, 0.0f, 7.0f, 0.0f, //
+                     },
+                     /*output_data=*/
+                     {
+                       true, false, true, false, //
+                     });
+  SUCCEED();
+}
+
+TEST(CastTest, BoolToFloat)
+{
+  CheckBoolTo<float>(/*shape=*/{1, 1, 1, 4},
+                     /*input_data=*/
+                     {
+                       true, false, false, true, //
+                     },
+                     /*output_data=*/
+                     {
+                       1.0f, 0.0f, 0.0f, 1.0f, //
+                     });
+  SUCCEED();
+}
+
+TEST(CastTest, FloatToFloat)
+{
+  Check<float, float>(/*shape=*/{1, 1, 1, 4},
+                      /*input_data=*/
+                      {
+                        1.0f, 0.0f, 7.0f, 0.0f, //
+                      },
+                      /*output_data=*/
+                      {
+                        1.0f, 0.0f, 7.0f, 0.0f, //
+                      });
+  SUCCEED();
+}
+
+TEST(CastTest, BoolToBool)
+{
+  CheckBoolTo<bool>(/*shape=*/{1, 1, 1, 4},
+                    /*input_data=*/
+                    {
+                      true, true, false, false, //
+                    },
+                    /*output_data=*/
+                    {
+                      true, true, false, false, //
+                    });
+  SUCCEED();
 }
 
 TEST(CastTest, UnsupportedType_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 1, 2, 4}, {
-                                                                           1, 2, 7, 8, //
-                                                                           1, 9, 7, 3, //
-                                                                         });
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 1, 2, 4},
+                                                           {
+                                                             1, 2, 7, 8, //
+                                                             1, 9, 7, 3, //
+                                                           },
+                                                           memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::Unknown);
 
   Cast kernel(&input_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
+  SUCCEED();
 }
 
 } // namespace
diff --git a/compiler/luci-interpreter/src/kernels/Concatenation.cpp b/compiler/luci-interpreter/src/kernels/Concatenation.cpp
index e3376c13d..7cfdf34b9 100644
--- a/compiler/luci-interpreter/src/kernels/Concatenation.cpp
+++ b/compiler/luci-interpreter/src/kernels/Concatenation.cpp
@@ -18,7 +18,7 @@
 #include "kernels/Concatenation.h"
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/concatenation.h>
 
 #include <stdexcept>
 
diff --git a/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp b/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp
index ee9b7d0d3..e4b50611a 100644
--- a/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Concatenation.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -26,12 +27,22 @@ namespace
 
 using namespace testing;
 
-TEST(ConcatenationTest, Float)
+class ConcatenationTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(ConcatenationTest, Float)
 {
   std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
   std::vector<float> input2_data{7, 8, 9, 10, 11, 12};
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data);
-  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input2_data);
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input2_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
   ConcatenationParams params{};
 
@@ -42,6 +53,10 @@ TEST(ConcatenationTest, Float)
 
     Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
     kernel.configure();
+    for (auto t : kernel.getOutputTensors())
+    {
+      _memory_manager->allocate_memory(*t);
+    }
     kernel.execute();
 
     EXPECT_THAT(extractTensorData<float>(output_tensor),
@@ -53,6 +68,7 @@ TEST(ConcatenationTest, Float)
 
     Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(extractTensorData<float>(output_tensor),
@@ -64,6 +80,7 @@ TEST(ConcatenationTest, Float)
 
     Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(extractTensorData<float>(output_tensor),
@@ -75,6 +92,7 @@ TEST(ConcatenationTest, Float)
 
     Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(extractTensorData<float>(output_tensor),
@@ -82,7 +100,7 @@ TEST(ConcatenationTest, Float)
   }
 }
 
-TEST(ConcatenationTest, Input_Number_Check_NEG)
+TEST_F(ConcatenationTest, Input_Number_Check_NEG)
 {
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
   ConcatenationParams params{};
@@ -94,12 +112,14 @@ TEST(ConcatenationTest, Input_Number_Check_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(ConcatenationTest, Invalid_Axis_NEG)
+TEST_F(ConcatenationTest, Invalid_Axis_NEG)
 {
   std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
   std::vector<float> input2_data{7, 8, 9, 10, 11, 12};
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data);
-  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input2_data);
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input2_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
   ConcatenationParams params{};
 
@@ -110,12 +130,13 @@ TEST(ConcatenationTest, Invalid_Axis_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(ConcatenationTest, Mismatching_Input_Type_NEG)
+TEST_F(ConcatenationTest, Mismatching_Input_Type_NEG)
 {
   std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
   std::vector<uint8_t> input2_data{7, 8, 9, 10, 11, 12};
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data);
-  Tensor input2_tensor = makeInputTensor<DataType::U8>({2, 3}, input2_data);
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::U8>({2, 3}, input2_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
   ConcatenationParams params{};
 
@@ -126,12 +147,14 @@ TEST(ConcatenationTest, Mismatching_Input_Type_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(ConcatenationTest, Mismatching_Input_Dimension_Num_NEG)
+TEST_F(ConcatenationTest, Mismatching_Input_Dimension_Num_NEG)
 {
   std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
   std::vector<float> input2_data{7, 8, 9, 10, 11, 12};
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data);
-  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({1, 2, 3}, input2_data);
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 3}, input2_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
   ConcatenationParams params{};
 
@@ -142,12 +165,14 @@ TEST(ConcatenationTest, Mismatching_Input_Dimension_Num_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(ConcatenationTest, Mismatching_Input_Dimension_NEG)
+TEST_F(ConcatenationTest, Mismatching_Input_Dimension_NEG)
 {
   std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
   std::vector<float> input2_data{7, 8, 9, 10, 11, 12, 13, 14, 15};
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data);
-  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, input2_data);
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>({3, 3}, input2_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
   ConcatenationParams params{};
 
@@ -158,12 +183,12 @@ TEST(ConcatenationTest, Mismatching_Input_Dimension_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(ConcatenationTest, Unsupported_Configure_Type_NEG)
+TEST_F(ConcatenationTest, Unsupported_Configure_Type_NEG)
 {
   std::vector<int8_t> input1_data{1, 2, 3, 4, 5, 6};
   std::vector<int8_t> input2_data{7, 8, 9, 10, 11, 12};
-  Tensor input1_tensor = makeInputTensor<DataType::S8>({2, 3}, input1_data);
-  Tensor input2_tensor = makeInputTensor<DataType::S8>({2, 3}, input2_data);
+  Tensor input1_tensor = makeInputTensor<DataType::S8>({2, 3}, input1_data, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S8>({2, 3}, input2_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S8);
   ConcatenationParams params{};
 
@@ -175,12 +200,14 @@ TEST(ConcatenationTest, Unsupported_Configure_Type_NEG)
 }
 
 // TODO: Remove this test when concat w/ fused_activation is supported
-TEST(ConcatenationTest, With_Fused_Activation_NEG)
+TEST_F(ConcatenationTest, With_Fused_Activation_NEG)
 {
   std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
   std::vector<float> input2_data{7, 8, 9, 10, 11, 12};
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data);
-  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input2_data);
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input2_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
   ConcatenationParams params{};
 
diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.cpp b/compiler/luci-interpreter/src/kernels/Conv2D.cpp
index 56ca96a34..fb5e063a9 100644
--- a/compiler/luci-interpreter/src/kernels/Conv2D.cpp
+++ b/compiler/luci-interpreter/src/kernels/Conv2D.cpp
@@ -19,7 +19,7 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
+#include "PALConv2d.h"
 
 #include <stdexcept>
 #include <thread>
@@ -30,8 +30,8 @@ namespace kernels
 {
 
 Conv2D::Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output,
-               const Conv2DParams &params)
-  : KernelWithParams<Conv2DParams>({input, filter, bias}, {output}, params)
+               Tensor *im2col, const Conv2DParams &params)
+  : KernelWithParams<Conv2DParams>({input, filter, bias}, {output, im2col}, params)
 {
 }
 
@@ -45,7 +45,7 @@ void Conv2D::configure()
   // (3) | uint8 uint8  int32 uint8  | quantized
   // (4) | int8  int8   int32 int8   | quantized per channel
   //
-  // We only support (1) and (3) for now, and additionally the following:
+  // We only support (1), (3) and (4) for now, and additionally the following:
   //     | input filter bias  output |
   // ----+---------------------------+
   // (5) | int16 int16  int64 int16  |
@@ -58,6 +58,17 @@ void Conv2D::configure()
   {
     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
   }
+  else if (input()->element_type() == DataType::S8 && filter()->element_type() == DataType::S8)
+  {
+    LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
+    LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
+    LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
+                           static_cast<size_t>(filter()->shape().dim(0)));
+    for (auto zerop : filter()->zero_points())
+    {
+      LUCI_INTERPRETER_CHECK(zerop == 0);
+    }
+  }
   else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
   {
     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
@@ -103,23 +114,20 @@ void Conv2D::configure()
     _params.dilation_height_factor != 1 || _params.dilation_width_factor != 1;
   const bool need_non_dilated_im2col = _params.stride_height != 1 || _params.stride_width != 1 ||
                                        filter_height != 1 || filter_width != 1;
-  const bool need_im2col =
+  _need_im2col =
     input()->element_type() != DataType::S16 && (need_dilated_im2col || need_non_dilated_im2col);
-  if (need_im2col)
+  if (_need_im2col)
   {
     const int input_depth = input_shape.dim(3);
     Shape im2col_shape{batches, output_height, output_width,
                        input_depth * filter_height * filter_width};
-    try
-    {
-      _im2col =
-        std::make_unique<Tensor>(input()->element_type(), im2col_shape, AffineQuantization{}, "");
-    }
-    catch (std::bad_alloc &ba)
-    {
-      // Failed memory allocation
-      _im2col = nullptr;
-    }
+    auto im2col = getOutputTensors()[1];
+    im2col->resize(im2col_shape);
+  }
+  else
+  {
+    auto im2col = getOutputTensors()[1];
+    im2col->set_allocatable(false);
   }
 }
 
@@ -147,14 +155,15 @@ void Conv2D::execute() const
         evalQuantizedPerChannel();
       }
       break;
+    case DataType::S8:
+      evalQuantizedS8PerChannel();
+      break;
     case DataType::S16:
       evalQuantizedS16();
       break;
     default:
       throw std::runtime_error("Unsupported type.");
   }
-  if (!!_im2col)
-    _im2col->deallocate();
 }
 
 void Conv2D::evalFloat() const
@@ -173,32 +182,16 @@ void Conv2D::evalFloat() const
   params.float_activation_min = activation_min;
   params.float_activation_max = activation_max;
 
-  if (_im2col)
+  float *im2col_data = nullptr;
+  auto im2col = getOutputTensors()[1];
+  if (_need_im2col)
   {
-    try
-    {
-      tflite::optimized_ops::Conv(
-        params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
-        getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
-        getTensorShape(output()), getTensorData<float>(output()), getTensorShape(_im2col.get()),
-        getTensorData<float>(_im2col.get()));
-    }
-    catch (std::bad_alloc &ba)
-    {
-      // Failed memory allocation
-      _im2col->deallocate();
-
-      tflite::reference_ops::Conv(
-        params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
-        getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
-        getTensorShape(output()), getTensorData<float>(output()), tflite::RuntimeShape(), nullptr);
-    }
+    im2col_data = im2col->data<float>();
   }
-  else
-    tflite::reference_ops::Conv(
-      params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
-      getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
-      getTensorShape(output()), getTensorData<float>(output()), tflite::RuntimeShape(), nullptr);
+  luci_interpreter_pal::Conv(
+    params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
+    getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
+    getTensorShape(output()), getTensorData<float>(output()), getTensorShape(im2col), im2col_data);
 }
 
 void Conv2D::evalQuantized() const
@@ -232,16 +225,12 @@ void Conv2D::evalQuantized() const
   params.quantized_activation_min = activation_min;
   params.quantized_activation_max = activation_max;
 
-  // TODO This should only be done once (although it takes only a few microseconds).
-  //  Also, the user should be able to adjust the number of threads.
-  auto gemmlowp_context = std::make_unique<gemmlowp::GemmContext>();
-  gemmlowp_context->set_max_num_threads(static_cast<int>(std::thread::hardware_concurrency()));
-
-  tflite::optimized_ops::Conv(
-    params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()),
-    getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
-    getTensorShape(output()), getTensorData<uint8_t>(output()), getTensorShape(_im2col.get()),
-    getTensorData<uint8_t>(_im2col.get()), gemmlowp_context.get());
+  auto im2col = getOutputTensors()[1];
+  luci_interpreter_pal::Conv(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+                             getTensorShape(filter()), getTensorData<uint8_t>(filter()),
+                             getTensorShape(bias()), getTensorData<int32_t>(bias()),
+                             getTensorShape(output()), getTensorData<uint8_t>(output()),
+                             getTensorShape(im2col), getTensorData<uint8_t>(im2col));
 }
 
 void Conv2D::evalQuantizedPerChannel() const
@@ -330,6 +319,54 @@ void Conv2D::evalQuantizedPerChannel() const
   }
 }
 
+void Conv2D::evalQuantizedS8PerChannel() const
+{
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+
+  tflite::ConvParams params{};
+  params.padding_values.height = _padding_height;
+  params.padding_values.width = _padding_width;
+  params.stride_height = _params.stride_height;
+  params.stride_width = _params.stride_width;
+  params.dilation_height_factor = _params.dilation_height_factor;
+  params.dilation_width_factor = _params.dilation_width_factor;
+  // The kernel expects filter zero points to be negated.
+  params.input_offset = -input()->zero_point(); // Note the '-'.
+  params.weights_offset = 0;                    // Unused in tflite code
+  params.output_offset = output()->zero_point();
+  params.quantized_activation_min = activation_min;
+  params.quantized_activation_max = activation_max;
+
+  const std::vector<double> effective_output_scales =
+    getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+
+  std::vector<ChannelQuantMultipliers> quant_multipliers =
+    quantizeMultipliers(effective_output_scales);
+
+  std::vector<int32_t> shifts;
+  std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts),
+                 [](ChannelQuantMultipliers cm) { return cm.shift; });
+  std::vector<int32_t> multipliers;
+  std::transform(quant_multipliers.begin(), quant_multipliers.end(),
+                 std::back_inserter(multipliers),
+                 [](ChannelQuantMultipliers cm) { return cm.multiplier; });
+
+  int8_t *im2col_data = nullptr;
+  auto im2col = getOutputTensors()[1];
+  if (_need_im2col)
+  {
+    im2col_data = im2col->data<int8_t>();
+  }
+
+  luci_interpreter_pal::ConvPerChannel(
+    params, multipliers.data(), shifts.data(), getTensorShape(input()),
+    getTensorData<int8_t>(input()), getTensorShape(filter()), getTensorData<int8_t>(filter()),
+    getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()),
+    getTensorData<int8_t>(output()), getTensorShape(im2col), im2col_data);
+}
+
 void Conv2D::evalQuantizedS16() const
 {
   const auto *input_data = getTensorData<int16_t>(input());
diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.h b/compiler/luci-interpreter/src/kernels/Conv2D.h
index 86f73c251..5f1317638 100644
--- a/compiler/luci-interpreter/src/kernels/Conv2D.h
+++ b/compiler/luci-interpreter/src/kernels/Conv2D.h
@@ -31,7 +31,7 @@ class Conv2D : public KernelWithParams<Conv2DParams>
 {
 public:
   Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output,
-         const Conv2DParams &params);
+         Tensor *im2col, const Conv2DParams &params);
 
   const Tensor *input() const { return _inputs[0]; }
   const Tensor *filter() const { return _inputs[1]; }
@@ -45,10 +45,11 @@ private:
   void evalFloat() const;
   void evalQuantized() const;
   void evalQuantizedPerChannel() const;
+  void evalQuantizedS8PerChannel() const;
   void evalQuantizedS16() const;
 
 private:
-  std::unique_ptr<Tensor> _im2col;
+  bool _need_im2col = false;
   int32_t _padding_height{};
   int32_t _padding_width{};
 };
diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp b/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp
index 8610a4fe6..277c280f5 100644
--- a/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Conv2D.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -26,7 +27,15 @@ namespace
 
 using namespace testing;
 
-TEST(Conv2DTest, Float)
+class Conv2DTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(Conv2DTest, Float)
 {
   Shape input_shape{1, 4, 3, 2};
   Shape filter_shape{2, 2, 2, 2};
@@ -44,9 +53,13 @@ TEST(Conv2DTest, Float)
     -8, -6, 7,  5,  // out = 1, row = 1
   };
   std::vector<float> bias_data{1, 2};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get());
+  Tensor im2col(DataType::FLOAT32, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Conv2DParams params{};
@@ -57,8 +70,10 @@ TEST(Conv2DTest, Float)
   params.dilation_width_factor = 1;
   params.activation = Activation::RELU;
 
-  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &im2col, params);
   kernel.configure();
+  _memory_manager->allocate_memory(im2col);
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{
@@ -70,7 +85,55 @@ TEST(Conv2DTest, Float)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(Conv2DTest, FloatCheck)
+TEST_F(Conv2DTest, FloatPointwise)
+{
+  Shape input_shape{1, 2, 2, 2};
+  Shape filter_shape{2, 1, 1, 2};
+  Shape bias_shape{2};
+  std::vector<float> input_data{
+    1, 2, // row = 0, col = 0
+    3, 4, // row = 0, col = 1
+    5, 6, // row = 1, col = 0
+    7, 8, // row = 1, col = 1
+  };
+  std::vector<float> filter_data{
+    -1, 2, // out = 0
+    -3, 4, // out = 1
+  };
+  std::vector<float> bias_data{1, 2};
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  Tensor im2col(DataType::FLOAT32, Shape({}), {}, "");
+
+  Conv2DParams params{};
+  params.padding = Padding::VALID;
+  params.stride_height = 1;
+  params.stride_width = 1;
+  params.dilation_height_factor = 1;
+  params.dilation_width_factor = 1;
+  params.activation = Activation::RELU;
+
+  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &im2col, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(im2col);
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{
+    4, 7,  6,  9,  // row = 0
+    8, 11, 10, 13, // row = 1
+  };
+  std::vector<int32_t> ref_output_shape{1, 2, 2, 2};
+  EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(Conv2DTest, FloatCheck)
 {
   Shape input_shape{2, 2, 4, 1};
   Shape filter_shape{3, 2, 2, 1};
@@ -89,9 +152,13 @@ TEST(Conv2DTest, FloatCheck)
     -1, -1, 1,  1, // third 2x2 filter
   };
   std::vector<float> bias_data{1, 2, 3};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get());
+  Tensor im2col(DataType::FLOAT32, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Conv2DParams params{};
@@ -102,8 +169,10 @@ TEST(Conv2DTest, FloatCheck)
   params.dilation_width_factor = 1;
   params.activation = Activation::NONE;
 
-  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &im2col, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  _memory_manager->allocate_memory(im2col);
   kernel.execute();
 
   std::vector<float> ref_output_data{
@@ -117,7 +186,7 @@ TEST(Conv2DTest, FloatCheck)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(Conv2DTest, Uint8)
+TEST_F(Conv2DTest, Uint8)
 {
   std::vector<float> input_data{
     // First batch
@@ -137,12 +206,15 @@ TEST(Conv2DTest, Uint8)
   std::pair<float, int32_t> input_quant_param = quantizationParams<uint8_t>(-63.5, 64);
   std::pair<float, int32_t> output_quant_param = quantizationParams<uint8_t>(-127, 128);
 
-  Tensor input_tensor = makeInputTensor<DataType::U8>({2, 2, 4, 1}, input_quant_param.first,
-                                                      input_quant_param.second, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::U8>({3, 2, 2, 1}, input_quant_param.first,
-                                                       input_quant_param.second, filter_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>({2, 2, 4, 1}, input_quant_param.first, input_quant_param.second,
+                                  input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::U8>({3, 2, 2, 1}, input_quant_param.first, input_quant_param.second,
+                                  filter_data, _memory_manager.get());
   Tensor bias_tensor = makeInputTensor<DataType::S32>(
-    {3}, input_quant_param.first * input_quant_param.first, 0, bias_data);
+    {3}, input_quant_param.first * input_quant_param.first, 0, bias_data, _memory_manager.get());
+  Tensor im2col(DataType::U8, Shape({}), {}, "");
   Tensor output_tensor =
     makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
 
@@ -154,8 +226,10 @@ TEST(Conv2DTest, Uint8)
   params.dilation_width_factor = 1;
   params.activation = Activation::NONE;
 
-  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &im2col, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  _memory_manager->allocate_memory(im2col);
   kernel.execute();
 
   std::vector<float> ref_output_data{
@@ -169,7 +243,7 @@ TEST(Conv2DTest, Uint8)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(Conv2DTest, Uint8_CWQ)
+TEST_F(Conv2DTest, Uint8_CWQ)
 {
   const int output_channels = 3;
   std::vector<float> input_data{
@@ -209,12 +283,14 @@ TEST(Conv2DTest, Uint8_CWQ)
     bias_scales.push_back(filter_quant_params[i].first * input_quant_param.first);
   std::vector<int32_t> zerop(output_channels, 0);
 
-  Tensor input_tensor = makeInputTensor<DataType::U8>({2, 2, 4, 1}, input_quant_param.first,
-                                                      input_quant_param.second, input_data);
-  Tensor filter_tensor =
-    makeInputTensor<DataType::U8>(filter_shape, filter_scales, filter_zerops, 0, filter_data);
-  Tensor bias_tensor =
-    makeInputTensor<DataType::S32>({output_channels}, bias_scales, zerop, 0, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>({2, 2, 4, 1}, input_quant_param.first, input_quant_param.second,
+                                  input_data, _memory_manager.get());
+  Tensor filter_tensor = makeInputTensor<DataType::U8>(filter_shape, filter_scales, filter_zerops,
+                                                       0, filter_data, _memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S32>({output_channels}, bias_scales, zerop, 0,
+                                                      bias_data, _memory_manager.get());
+  Tensor im2col(DataType::U8, Shape({}), {}, "");
   Tensor output_tensor =
     makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
 
@@ -226,8 +302,10 @@ TEST(Conv2DTest, Uint8_CWQ)
   params.dilation_width_factor = 1;
   params.activation = Activation::NONE;
 
-  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &im2col, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  _memory_manager->allocate_memory(im2col);
   kernel.execute();
 
   std::vector<float> ref_output_data{
@@ -241,7 +319,83 @@ TEST(Conv2DTest, Uint8_CWQ)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(Conv2DTest, SInt16)
+TEST_F(Conv2DTest, SInt8_CWQ)
+{
+  const int output_channels = 3;
+  std::vector<float> input_data{
+    // First batch
+    1, 1, 1, 1, // row = 1
+    2, 2, 2, 2, // row = 2
+                // Second batch
+    1, 2, 3, 4, // row = 1
+    1, 2, 3, 4, // row = 2
+  };
+  std::vector<float> filter_data{
+    1,  2,  3,  4, // first 2x2 filter
+    -1, 1,  -1, 1, // second 2x2 filter
+    -1, -1, 1,  1, // third 2x2 filter
+  };
+  std::vector<float> bias_data{1, 2, 3};
+  Shape filter_shape{output_channels, 2, 2, 1};
+
+  std::pair<float, int32_t> input_quant_param = quantizationParams<int8_t>(0, 4);
+  std::pair<float, int32_t> output_quant_param = quantizationParams<int8_t>(-127, 128);
+
+  std::vector<std::pair<float, int32_t>> filter_quant_params;
+  filter_quant_params.push_back(std::pair<float, int32_t>(0.5, 0));
+  filter_quant_params.push_back(std::pair<float, int32_t>(0.25, 0));
+  filter_quant_params.push_back(std::pair<float, int32_t>(0.125, 0));
+
+  std::vector<float> filter_scales;
+  std::vector<int32_t> filter_zerops;
+  for (auto iter : filter_quant_params)
+  {
+    filter_scales.push_back(iter.first);
+    filter_zerops.push_back(iter.second);
+  }
+
+  std::vector<float> bias_scales;
+  for (int i = 0; i < output_channels; ++i)
+    bias_scales.push_back(filter_quant_params[i].first * input_quant_param.first);
+  std::vector<int32_t> zerop(output_channels, 0);
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::S8>({2, 2, 4, 1}, input_quant_param.first, input_quant_param.second,
+                                  input_data, _memory_manager.get());
+  Tensor filter_tensor = makeInputTensor<DataType::S8>(filter_shape, filter_scales, filter_zerops,
+                                                       0, filter_data, _memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S32>({output_channels}, bias_scales, zerop, 0,
+                                                      bias_data, _memory_manager.get());
+  Tensor im2col(DataType::S8, Shape({}), {}, "");
+  Tensor output_tensor =
+    makeOutputTensor(DataType::S8, output_quant_param.first, output_quant_param.second);
+
+  Conv2DParams params{};
+  params.padding = Padding::VALID;
+  params.stride_height = 2;
+  params.stride_width = 2;
+  params.dilation_height_factor = 1;
+  params.dilation_width_factor = 1;
+  params.activation = Activation::NONE;
+
+  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &im2col, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  _memory_manager->allocate_memory(im2col);
+  kernel.execute();
+
+  std::vector<float> ref_output_data{
+    18, 2, 5, // first batch, left
+    18, 2, 5, // first batch, right
+    17, 4, 3, // second batch, left
+    37, 4, 3, // second batch, right
+  };
+  std::vector<int32_t> ref_output_shape{2, 1, 2, 3};
+  EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(Conv2DTest, SInt16)
 {
   Shape input_shape{1, 4, 3, 2};
   Shape filter_shape{2, 2, 2, 2};
@@ -266,9 +420,13 @@ TEST(Conv2DTest, SInt16)
     0,  40, 0, 44, // row = 1
   };
 
-  Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, 0.25, 0, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::S16>(filter_shape, 0.2, 0, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, 0.25 * 0.2, 0, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>(input_shape, 0.25, 0, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::S16>(filter_shape, 0.2, 0, filter_data, _memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::S64>(bias_shape, 0.25 * 0.2, 0, bias_data, _memory_manager.get());
+  Tensor im2col(DataType::S16, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(DataType::S16, 0.5, 0);
 
   Conv2DParams params{};
@@ -279,15 +437,17 @@ TEST(Conv2DTest, SInt16)
   params.dilation_width_factor = 1;
   params.activation = Activation::RELU;
 
-  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &im2col, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  _memory_manager->allocate_memory(im2col);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
   EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
 }
 
-TEST(Conv2DTest, SInt16_CWQ_weights)
+TEST_F(Conv2DTest, SInt16_CWQ_weights)
 {
   Shape input_shape{1, 2, 2, 2};  // Batch x H x W x C
   Shape filter_shape{3, 1, 1, 2}; // Out channels x H x W x In Channels
@@ -321,10 +481,13 @@ TEST(Conv2DTest, SInt16_CWQ_weights)
     bias_scales.push_back(filter_scales[i] * input_scale);
   std::vector<int32_t> zerop = {0, 0, 0};
 
-  Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, input_scale, 0, input_data);
-  Tensor filter_tensor =
-    makeInputTensor<DataType::S16>(filter_shape, filter_scales, zerop, 0, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, bias_scales, zerop, 0, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>(input_shape, input_scale, 0, input_data, _memory_manager.get());
+  Tensor filter_tensor = makeInputTensor<DataType::S16>(filter_shape, filter_scales, zerop, 0,
+                                                        filter_data, _memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, bias_scales, zerop, 0, bias_data,
+                                                      _memory_manager.get());
+  Tensor im2col(DataType::S16, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(DataType::S16, output_scale, 0);
 
   Conv2DParams params{};
@@ -335,15 +498,17 @@ TEST(Conv2DTest, SInt16_CWQ_weights)
   params.dilation_width_factor = 1;
   params.activation = Activation::RELU;
 
-  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &im2col, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  _memory_manager->allocate_memory(im2col);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
   EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
 }
 
-TEST(Conv2DTest, Unsupported_Type_Configure_NEG)
+TEST_F(Conv2DTest, Unsupported_Type_Configure_NEG)
 {
   Shape input_shape{1, 4, 3, 2};
   Shape filter_shape{2, 2, 2, 2};
@@ -361,9 +526,13 @@ TEST(Conv2DTest, Unsupported_Type_Configure_NEG)
     -8, -6, 7,  5,  // out = 1, row = 1
   };
   std::vector<float> bias_data{1, 2};
-  Tensor input_tensor = makeInputTensor<DataType::S32>(input_shape, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S32>(input_shape, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get());
+  Tensor im2col(DataType::FLOAT32, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Conv2DParams params{};
@@ -374,11 +543,11 @@ TEST(Conv2DTest, Unsupported_Type_Configure_NEG)
   params.dilation_width_factor = 1;
   params.activation = Activation::RELU;
 
-  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &im2col, params);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(Conv2DTest, Invalid_Bias_Type_NEG)
+TEST_F(Conv2DTest, Invalid_Bias_Type_NEG)
 {
   Shape input_shape{1, 4, 3, 2};
   Shape filter_shape{2, 2, 2, 2};
@@ -396,9 +565,12 @@ TEST(Conv2DTest, Invalid_Bias_Type_NEG)
     -8, -6, 7,  5,  // out = 1, row = 1
   };
   std::vector<uint8_t> bias_data{1, 2};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::U8>(bias_shape, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::U8>(bias_shape, bias_data, _memory_manager.get());
+  Tensor im2col(DataType::FLOAT32, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Conv2DParams params{};
@@ -409,11 +581,11 @@ TEST(Conv2DTest, Invalid_Bias_Type_NEG)
   params.dilation_width_factor = 1;
   params.activation = Activation::RELU;
 
-  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &im2col, params);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(Conv2DTest, Invalid_Bias_Data_NEG)
+TEST_F(Conv2DTest, Invalid_Bias_Data_NEG)
 {
   Shape input_shape{1, 4, 3, 2};
   Shape filter_shape{2, 2, 2, 2};
@@ -431,9 +603,13 @@ TEST(Conv2DTest, Invalid_Bias_Data_NEG)
     -8, -6, 7,  5,  // out = 1, row = 1
   };
   std::vector<float> bias_data{1, 2, 3};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get());
+  Tensor im2col(DataType::FLOAT32, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Conv2DParams params{};
@@ -444,11 +620,11 @@ TEST(Conv2DTest, Invalid_Bias_Data_NEG)
   params.dilation_width_factor = 1;
   params.activation = Activation::RELU;
 
-  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &im2col, params);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(Conv2DTest, Invalid_Input_Shape_NEG)
+TEST_F(Conv2DTest, Invalid_Input_Shape_NEG)
 {
   Shape input_shape{1, 4, 6, 1};
   Shape filter_shape{2, 2, 2, 2};
@@ -466,9 +642,13 @@ TEST(Conv2DTest, Invalid_Input_Shape_NEG)
     -8, -6, 7,  5,  // out = 1, row = 1
   };
   std::vector<float> bias_data{1, 2};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get());
+  Tensor im2col(DataType::FLOAT32, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Conv2DParams params{};
@@ -479,7 +659,7 @@ TEST(Conv2DTest, Invalid_Input_Shape_NEG)
   params.dilation_width_factor = 1;
   params.activation = Activation::RELU;
 
-  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  Conv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &im2col, params);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
index f2b9e4ccc..3a9acd1d4 100644
--- a/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
+++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
@@ -16,7 +16,7 @@
 
 #include "DepthToSpace.h"
 #include "Utils.h"
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALDepthToSpace.h"
 
 namespace luci_interpreter
 {
@@ -62,14 +62,14 @@ void DepthToSpace::execute() const
   switch (input()->element_type())
   {
     case DataType::FLOAT32:
-      tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()),
-                                          getTensorData<float>(input()), getTensorShape(output()),
-                                          getTensorData<float>(output()));
+      luci_interpreter_pal::DepthToSpace(op_params, getTensorShape(input()),
+                                         getTensorData<float>(input()), getTensorShape(output()),
+                                         getTensorData<float>(output()));
       break;
     case DataType::U8:
-      tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()),
-                                          getTensorData<uint8_t>(input()), getTensorShape(output()),
-                                          getTensorData<uint8_t>(output()));
+      luci_interpreter_pal::DepthToSpace(op_params, getTensorShape(input()),
+                                         getTensorData<uint8_t>(input()), getTensorShape(output()),
+                                         getTensorData<uint8_t>(output()));
       break;
     default:
       throw std::runtime_error("Unsupported Type.");
diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp
index 3dee4ad36..9b1c09ba9 100644
--- a/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/DepthToSpace.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -35,12 +36,14 @@ TYPED_TEST_CASE(DepthToSpaceTest, DataTypes);
 
 TYPED_TEST(DepthToSpaceTest, SimpleCase)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<TypeParam> input_data{1, 2, 3, 4, 5, 6, 7, 8};
   Shape input_shape{1, 1, 2, 4};
   std::vector<TypeParam> output_data{1, 2, 5, 6, 3, 4, 7, 8};
   std::vector<int32_t> output_shape{1, 2, 4, 1};
 
-  Tensor input_tensor = makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
 
   DepthToSpaceParams params{};
@@ -48,6 +51,7 @@ TYPED_TEST(DepthToSpaceTest, SimpleCase)
 
   DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
@@ -57,10 +61,12 @@ TYPED_TEST(DepthToSpaceTest, SimpleCase)
 
 TEST(DepthToSpaceTest, InvalidInputShape_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8};
   Shape input_shape{1, 2, 4};
 
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   DepthToSpaceParams params{};
@@ -72,10 +78,12 @@ TEST(DepthToSpaceTest, InvalidInputShape_NEG)
 
 TEST(DepthToSpaceTest, InOutTypeMismatch_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8};
   Shape input_shape{1, 1, 2, 4};
 
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   DepthToSpaceParams params{};
@@ -87,10 +95,12 @@ TEST(DepthToSpaceTest, InOutTypeMismatch_NEG)
 
 TEST(DepthToSpaceTest, InvalidBlockSize_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8};
   Shape input_shape{1, 1, 2, 4};
 
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   DepthToSpaceParams params{};
diff --git a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp
index 1452f4421..f2dbf6c68 100644
--- a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp
+++ b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp
@@ -20,6 +20,7 @@
 
 #include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h>
 #include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h>
 
 #include <stdexcept>
 
@@ -45,7 +46,7 @@ void DepthwiseConv2D::configure()
   // (4) | int8  int8   int32 int8   | quantized per channel
   // (5) | int16 int8   int64 int16  | quantized per channel 16x8
   //
-  // We only support (1) and (3) for now, and additionally the following:
+  // We only support (1), (3) and (4) for now, and additionally the following:
   //     | input filter bias  output |
   // ----+---------------------------+
   // (5) | int16 int16  int64 int16  |
@@ -58,6 +59,17 @@ void DepthwiseConv2D::configure()
   {
     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
   }
+  else if (input()->element_type() == DataType::S8 && filter()->element_type() == DataType::S8)
+  {
+    LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
+    LUCI_INTERPRETER_CHECK(static_cast<uint32_t>(filter()->shape().dim(3)) ==
+                           filter()->scales().size());
+    for (auto zerop : filter()->zero_points())
+    {
+      LUCI_INTERPRETER_CHECK(zerop == 0);
+    }
+    LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
+  }
   else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
   {
     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
@@ -123,6 +135,9 @@ void DepthwiseConv2D::execute() const
         evalQuantizedPerChannel();
       }
       break;
+    case DataType::S8:
+      evalQuantizedS8PerChannel();
+      break;
     case DataType::S16:
       evalQuantizedS16();
       break;
@@ -283,6 +298,52 @@ void DepthwiseConv2D::evalQuantized() const
     getTensorShape(output()), getTensorData<uint8_t>(output()));
 }
 
+void DepthwiseConv2D::evalQuantizedS8PerChannel() const
+{
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+
+  tflite::DepthwiseParams params{};
+
+  params.padding_type = tflite::PaddingType::kSame;
+  params.padding_values.height = _padding_height;
+  params.padding_values.width = _padding_width;
+  params.stride_height = _params.stride_height;
+  params.stride_width = _params.stride_width;
+  params.dilation_height_factor = _params.dilation_height_factor;
+  params.dilation_width_factor = _params.dilation_width_factor;
+  params.depth_multiplier = _params.depth_multiplier;
+  // The kernel expects input and filter zero points to be negated.
+  params.input_offset = -input()->zero_point(); // Note the '-'.
+  params.weights_offset = 0;
+  params.output_offset = output()->zero_point();
+  params.output_multiplier = 1; // unused in tflite code
+  params.output_shift = 0;      // unused in tflite code
+  params.quantized_activation_min = activation_min;
+  params.quantized_activation_max = activation_max;
+
+  const std::vector<double> effective_output_scales =
+    getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+
+  std::vector<ChannelQuantMultipliers> quant_multipliers =
+    quantizeMultipliers(effective_output_scales);
+
+  std::vector<int32_t> shifts;
+  std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts),
+                 [](ChannelQuantMultipliers cm) { return cm.shift; });
+  std::vector<int32_t> multipliers;
+  std::transform(quant_multipliers.begin(), quant_multipliers.end(),
+                 std::back_inserter(multipliers),
+                 [](ChannelQuantMultipliers cm) { return cm.multiplier; });
+
+  tflite::reference_integer_ops::DepthwiseConvPerChannel(
+    params, multipliers.data(), shifts.data(), getTensorShape(input()),
+    getTensorData<int8_t>(input()), getTensorShape(filter()), getTensorData<int8_t>(filter()),
+    getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()),
+    getTensorData<int8_t>(output()));
+}
+
 void DepthwiseConv2D::evalQuantizedS16() const
 {
   const auto *input_data = getTensorData<int16_t>(input());
diff --git a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.h b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.h
index 6d700dd0f..6cffd6583 100644
--- a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.h
+++ b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.h
@@ -43,6 +43,7 @@ private:
   void evalFloat() const;
   void evalQuantized() const;
   void evalQuantizedPerChannel() const;
+  void evalQuantizedS8PerChannel() const;
   void evalQuantizedS16() const;
 
 private:
diff --git a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp
index 3e2f434dd..74975899a 100644
--- a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/DepthwiseConv2D.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -26,7 +27,15 @@ namespace
 
 using namespace testing;
 
-TEST(DepthwiseConv2DTest, Float)
+class DepthwiseConv2DTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(DepthwiseConv2DTest, Float)
 {
   Shape input_shape{1, 4, 2, 2};
   Shape filter_shape{1, 2, 2, 4};
@@ -44,9 +53,12 @@ TEST(DepthwiseConv2DTest, Float)
     13, -14, 15,  -16, //
   };
   std::vector<float> bias_data{1, 2, 3, 4};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   DepthwiseConv2DParams params{};
@@ -60,6 +72,7 @@ TEST(DepthwiseConv2DTest, Float)
 
   DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{
@@ -70,7 +83,7 @@ TEST(DepthwiseConv2DTest, Float)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 1, 4}));
 }
 
-TEST(DepthwiseConv2DTest, Uint8)
+TEST_F(DepthwiseConv2DTest, Uint8)
 {
   std::vector<float> input_data{
     1, 2, 7,  8,  // column 1
@@ -88,12 +101,14 @@ TEST(DepthwiseConv2DTest, Uint8)
   std::pair<float, int32_t> input_quant_param = quantizationParams<uint8_t>(-63.5, 64);
   std::pair<float, int32_t> output_quant_param = quantizationParams<uint8_t>(-127, 128);
 
-  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 3, 2, 2}, input_quant_param.first,
-                                                      input_quant_param.second, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::U8>({1, 2, 2, 4}, input_quant_param.first,
-                                                       input_quant_param.second, filter_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>({1, 3, 2, 2}, input_quant_param.first, input_quant_param.second,
+                                  input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::U8>({1, 2, 2, 4}, input_quant_param.first, input_quant_param.second,
+                                  filter_data, _memory_manager.get());
   Tensor bias_tensor = makeInputTensor<DataType::S32>(
-    {4}, input_quant_param.first * input_quant_param.first, 0, bias_data);
+    {4}, input_quant_param.first * input_quant_param.first, 0, bias_data, _memory_manager.get());
   Tensor output_tensor =
     makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
 
@@ -108,6 +123,7 @@ TEST(DepthwiseConv2DTest, Uint8)
 
   DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{
@@ -118,7 +134,7 @@ TEST(DepthwiseConv2DTest, Uint8)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 1, 4}));
 }
 
-TEST(DepthwiseConv2DTest, SInt16)
+TEST_F(DepthwiseConv2DTest, SInt16)
 {
   Shape input_shape{1, 4, 2, 2};
   Shape filter_shape{1, 2, 2, 4};
@@ -143,9 +159,12 @@ TEST(DepthwiseConv2DTest, SInt16)
     167, 0, 227, 28, //
   };
 
-  Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, 0.25, 0, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::S16>(filter_shape, 0.2, 0, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, 0.25 * 0.2, 0, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>(input_shape, 0.25, 0, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::S16>(filter_shape, 0.2, 0, filter_data, _memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::S64>(bias_shape, 0.25 * 0.2, 0, bias_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S16, 0.5, 0);
 
   DepthwiseConv2DParams params{};
@@ -159,13 +178,14 @@ TEST(DepthwiseConv2DTest, SInt16)
 
   DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
   EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
 }
 
-TEST(DepthwiseConv2DTest, SInt16_CWQ_weights)
+TEST_F(DepthwiseConv2DTest, SInt16_CWQ_weights)
 {
   const int output_channels = 4;
   Shape input_shape{1, 4, 2, 2};
@@ -197,10 +217,12 @@ TEST(DepthwiseConv2DTest, SInt16_CWQ_weights)
   for (int i = 0; i < output_channels; ++i)
     bias_scales.push_back(filter_scales[i] * input_scale);
   std::vector<int32_t> zerop(4, 0);
-  Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, input_scale, 0, input_data);
-  Tensor filter_tensor =
-    makeInputTensor<DataType::S16>(filter_shape, filter_scales, zerop, 3, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, bias_scales, zerop, 0, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>(input_shape, input_scale, 0, input_data, _memory_manager.get());
+  Tensor filter_tensor = makeInputTensor<DataType::S16>(filter_shape, filter_scales, zerop, 3,
+                                                        filter_data, _memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, bias_scales, zerop, 0, bias_data,
+                                                      _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S16, 0.5, 0);
 
   DepthwiseConv2DParams params{};
@@ -214,13 +236,14 @@ TEST(DepthwiseConv2DTest, SInt16_CWQ_weights)
 
   DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
   EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
 }
 
-TEST(DepthwiseConv2DTest, Uint8_CWQ_weights)
+TEST_F(DepthwiseConv2DTest, Uint8_CWQ_weights)
 {
   const int output_channels = 4;
   Shape input_shape{1, 3, 2, 2};
@@ -267,11 +290,13 @@ TEST(DepthwiseConv2DTest, Uint8_CWQ_weights)
     bias_scales.push_back(filter_quant_params[i].first * input_quant_param.first);
   std::vector<int32_t> zerop(output_channels, 0);
 
-  Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, input_quant_param.first,
-                                                      input_quant_param.second, input_data);
-  Tensor filter_tensor =
-    makeInputTensor<DataType::U8>(filter_shape, filter_scales, filter_zerops, 3, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::S32>(bias_shape, bias_scales, zerop, 0, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>(input_shape, input_quant_param.first, input_quant_param.second,
+                                  input_data, _memory_manager.get());
+  Tensor filter_tensor = makeInputTensor<DataType::U8>(filter_shape, filter_scales, filter_zerops,
+                                                       3, filter_data, _memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S32>(bias_shape, bias_scales, zerop, 0, bias_data,
+                                                      _memory_manager.get());
   Tensor output_tensor =
     makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
 
@@ -286,6 +311,7 @@ TEST(DepthwiseConv2DTest, Uint8_CWQ_weights)
 
   DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
@@ -293,7 +319,83 @@ TEST(DepthwiseConv2DTest, Uint8_CWQ_weights)
               FloatArrayNear(ref_output_data, output_quant_param.first));
 }
 
-TEST(DepthwiseConv2DTest, InvalidBiasType_NEG)
+TEST_F(DepthwiseConv2DTest, SInt8_CWQ_weights)
+{
+  const int output_channels = 4;
+  Shape input_shape{1, 3, 2, 2};
+  Shape filter_shape{1, 2, 2, output_channels};
+  Shape bias_shape{4};
+  std::vector<int32_t> ref_output_shape{1, 2, 1, output_channels};
+
+  std::vector<float> input_data{
+    1, 2, 7,  8,  //
+    3, 4, 9,  10, //
+    5, 6, 11, 12, //
+  };
+  std::vector<float> filter_data{
+    1,  2,   3,   4,   //
+    -9, 10,  -11, 12,  //
+    5,  6,   7,   8,   //
+    13, -14, 15,  -16, //
+  };
+  std::vector<float> bias_data{1, 2, 3, 4};
+  std::vector<float> ref_output_data{
+    71, -34, 99,  -20, //
+    91, -26, 127, -4,  //
+  };
+
+  std::pair<float, int32_t> input_quant_param = quantizationParams<int8_t>(-128, 127);
+  std::pair<float, int32_t> output_quant_param = quantizationParams<int8_t>(-127, 128);
+
+  std::vector<std::pair<float, int32_t>> filter_quant_params;
+  filter_quant_params.push_back(std::pair<float, int32_t>(0.5, 0));
+  filter_quant_params.push_back(std::pair<float, int32_t>(0.25, 0));
+  filter_quant_params.push_back(std::pair<float, int32_t>(1, 0));
+  filter_quant_params.push_back(std::pair<float, int32_t>(0.125, 0));
+
+  std::vector<float> filter_scales;
+  std::vector<int32_t> filter_zerops;
+  for (auto iter : filter_quant_params)
+  {
+    filter_scales.push_back(iter.first);
+    filter_zerops.push_back(iter.second);
+  }
+
+  std::vector<float> bias_scales;
+  for (int i = 0; i < output_channels; ++i)
+    bias_scales.push_back(filter_quant_params[i].first * input_quant_param.first);
+  std::vector<int32_t> zerop(output_channels, 0);
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::S8>(input_shape, input_quant_param.first, input_quant_param.second,
+                                  input_data, _memory_manager.get());
+  Tensor filter_tensor = makeInputTensor<DataType::S8>(filter_shape, filter_scales, filter_zerops,
+                                                       3, filter_data, _memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S32>(bias_shape, bias_scales, zerop, 0, bias_data,
+                                                      _memory_manager.get());
+  Tensor output_tensor =
+    makeOutputTensor(DataType::S8, output_quant_param.first, output_quant_param.second);
+
+  DepthwiseConv2DParams params{};
+  params.padding = Padding::VALID;
+  params.depth_multiplier = 2;
+  params.stride_height = 1;
+  params.stride_width = 1;
+  params.dilation_height_factor = 1;
+  params.dilation_width_factor = 1;
+  params.activation = Activation::NONE;
+
+  DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(ref_output_data, output_quant_param.first));
+}
+
+TEST_F(DepthwiseConv2DTest, InvalidBiasType_NEG)
 {
   Shape input_shape{1, 4, 2, 2};
   Shape filter_shape{1, 2, 2, 4};
@@ -311,9 +413,11 @@ TEST(DepthwiseConv2DTest, InvalidBiasType_NEG)
     13, -14, 15,  -16, //
   };
   std::vector<int32_t> bias_data{1, 2, 3, 4};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::S32>(bias_shape, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S32>(bias_shape, bias_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   DepthwiseConv2DParams params{};
@@ -329,7 +433,7 @@ TEST(DepthwiseConv2DTest, InvalidBiasType_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(DepthwiseConv2DTest, InOutTypeMismatch_NEG)
+TEST_F(DepthwiseConv2DTest, InOutTypeMismatch_NEG)
 {
   Shape input_shape{1, 4, 2, 2};
   Shape filter_shape{1, 2, 2, 4};
@@ -347,9 +451,12 @@ TEST(DepthwiseConv2DTest, InOutTypeMismatch_NEG)
     13, -14, 15,  -16, //
   };
   std::vector<float> bias_data{1, 2, 3, 4};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   DepthwiseConv2DParams params{};
@@ -365,7 +472,7 @@ TEST(DepthwiseConv2DTest, InOutTypeMismatch_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(DepthwiseConv2DTest, InvalidInputShape_NEG)
+TEST_F(DepthwiseConv2DTest, InvalidInputShape_NEG)
 {
   Shape input_shape{4, 2, 2};
   Shape filter_shape{2, 2, 4};
@@ -383,9 +490,12 @@ TEST(DepthwiseConv2DTest, InvalidInputShape_NEG)
     13, -14, 15,  -16, //
   };
   std::vector<float> bias_data{1, 2, 3, 4};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   DepthwiseConv2DParams params{};
@@ -401,7 +511,7 @@ TEST(DepthwiseConv2DTest, InvalidInputShape_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(DepthwiseConv2DTest, InvalidFilterShape_NEG)
+TEST_F(DepthwiseConv2DTest, InvalidFilterShape_NEG)
 {
   Shape input_shape{1, 4, 2, 2};
   Shape filter_shape{2, 1, 2, 4};
@@ -419,9 +529,12 @@ TEST(DepthwiseConv2DTest, InvalidFilterShape_NEG)
     13, -14, 15,  -16, //
   };
   std::vector<float> bias_data{1, 2, 3, 4};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   DepthwiseConv2DParams params{};
@@ -437,7 +550,7 @@ TEST(DepthwiseConv2DTest, InvalidFilterShape_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(DepthwiseConv2DTest, InvalidBiasDim_NEG)
+TEST_F(DepthwiseConv2DTest, InvalidBiasDim_NEG)
 {
   Shape input_shape{1, 4, 2, 2};
   Shape filter_shape{1, 2, 4, 2};
@@ -455,9 +568,12 @@ TEST(DepthwiseConv2DTest, InvalidBiasDim_NEG)
     13, -14, 15,  -16, //
   };
   std::vector<float> bias_data{1, 2, 3, 4};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   DepthwiseConv2DParams params{};
diff --git a/compiler/luci-interpreter/src/kernels/Div.cpp b/compiler/luci-interpreter/src/kernels/Div.cpp
index db1496d37..0e52ba1f0 100644
--- a/compiler/luci-interpreter/src/kernels/Div.cpp
+++ b/compiler/luci-interpreter/src/kernels/Div.cpp
@@ -18,7 +18,8 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/div.h>
+#include <tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h>
 
 namespace luci_interpreter
 {
diff --git a/compiler/luci-interpreter/src/kernels/Div.test.cpp b/compiler/luci-interpreter/src/kernels/Div.test.cpp
index 1a0c4af15..021d68d06 100644
--- a/compiler/luci-interpreter/src/kernels/Div.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Div.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Div.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,6 +28,14 @@ namespace
 
 using namespace testing;
 
+class DivTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
 float GetTolerance(float min, float max)
 {
   const float kQuantizedStep = (max - min) / 255.0f;
@@ -34,7 +43,7 @@ float GetTolerance(float min, float max)
   return kQuantizedTolerance;
 }
 
-TEST(DivTest, Float)
+TEST_F(DivTest, Float)
 {
   Shape base_shape = {2, 3, 1, 1};
 
@@ -44,8 +53,10 @@ TEST(DivTest, Float)
   std::vector<float> input2_data{0.2f, 1.6f, 0.5f, 0.4f, 1.6f, 0.4f};
   std::vector<float> test_outputs{1.5f, 1.4375f, 1.8f, 1.25f, 0.5f, 2.75f};
 
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(base_shape, input1_data);
-  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(base_shape, input2_data);
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>(base_shape, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>(base_shape, input2_data, _memory_manager.get());
 
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
@@ -54,13 +65,14 @@ TEST(DivTest, Float)
 
   Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs, 0.0001f));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
 }
 
-TEST(DivTest, FloatBroadcast)
+TEST_F(DivTest, FloatBroadcast)
 {
   Shape input1_shape = {1, 3};
   Shape input2_shape = {3, 1};
@@ -69,8 +81,10 @@ TEST(DivTest, FloatBroadcast)
   std::vector<float> input2_data{0.2f, 1.6f, 0.5f};
   std::vector<float> test_outputs{0.f, 11.5f, 4.5f, 0.f, 1.4375f, 0.5625f, 0.f, 4.6f, 1.8f};
 
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(input1_shape, input1_data);
-  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(input2_shape, input2_data);
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>(input1_shape, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>(input2_shape, input2_data, _memory_manager.get());
 
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
@@ -79,12 +93,13 @@ TEST(DivTest, FloatBroadcast)
 
   Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs, 0.0001f));
 }
 
-TEST(DivTest, Uint8)
+TEST_F(DivTest, Uint8)
 {
   Shape base_shape = {1, 2, 2, 1};
 
@@ -98,10 +113,10 @@ TEST(DivTest, Uint8)
 
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.f, 1.f);
 
-  Tensor input1_tensor =
-    makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, input1_data);
-  Tensor input2_tensor =
-    makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, input2_data);
+  Tensor input1_tensor = makeInputTensor<DataType::U8>(
+    base_shape, quant_param.first, quant_param.second, input1_data, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::U8>(
+    base_shape, quant_param.first, quant_param.second, input2_data, _memory_manager.get());
 
   Tensor output_tensor =
     makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
@@ -111,6 +126,7 @@ TEST(DivTest, Uint8)
 
   Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(dequantizeTensorData(output_tensor),
@@ -118,10 +134,10 @@ TEST(DivTest, Uint8)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
 }
 
-TEST(DivTest, Input_Output_Type_NEG)
+TEST_F(DivTest, Input_Output_Type_NEG)
 {
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {2});
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {2}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   DivParams params{};
@@ -131,10 +147,10 @@ TEST(DivTest, Input_Output_Type_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(DivTest, Invalid_Input_Type_NEG)
+TEST_F(DivTest, Invalid_Input_Type_NEG)
 {
-  Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1});
-  Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2});
+  Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S64);
 
   DivParams params{};
@@ -142,6 +158,7 @@ TEST(DivTest, Invalid_Input_Type_NEG)
 
   Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   EXPECT_ANY_THROW(kernel.execute());
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/Elu.cpp b/compiler/luci-interpreter/src/kernels/Elu.cpp
index 456396055..697d63be4 100644
--- a/compiler/luci-interpreter/src/kernels/Elu.cpp
+++ b/compiler/luci-interpreter/src/kernels/Elu.cpp
@@ -17,7 +17,7 @@
 #include "kernels/Elu.h"
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALElu.h"
 
 #include <stdexcept>
 
@@ -40,8 +40,8 @@ void Elu::execute() const
   switch (input()->element_type())
   {
     case DataType::FLOAT32:
-      tflite::optimized_ops::Elu(getTensorShape(input()), getTensorData<float>(input()),
-                                 getTensorShape(output()), getTensorData<float>(output()));
+      luci_interpreter_pal::Elu(getTensorShape(input()), getTensorData<float>(input()),
+                                getTensorShape(output()), getTensorData<float>(output()));
       break;
     default:
       throw std::runtime_error("Unsupported type.");
diff --git a/compiler/luci-interpreter/src/kernels/Elu.test.cpp b/compiler/luci-interpreter/src/kernels/Elu.test.cpp
index e26eed03e..814499cdb 100644
--- a/compiler/luci-interpreter/src/kernels/Elu.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Elu.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Elu.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -29,11 +30,14 @@ using namespace testing;
 void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
            std::initializer_list<float> input_data, std::initializer_list<float> output_data)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Elu kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   (void)output_shape;
@@ -58,12 +62,14 @@ TEST(EluTest, SimpleElu)
 
 TEST(EluTest, InOutTypeMismatch_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   Shape input_shape{1, 2, 4, 1};
   std::vector<float> input_data{
     0, -6, 2,  -4,   //
     3, -2, 10, -0.1, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   Elu kernel(&input_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/Equal.h b/compiler/luci-interpreter/src/kernels/Equal.h
index 69b3be774..11f025eac 100644
--- a/compiler/luci-interpreter/src/kernels/Equal.h
+++ b/compiler/luci-interpreter/src/kernels/Equal.h
@@ -42,9 +42,9 @@ private:
 
 private:
   int32_t _x_multiplier = 0;
-  int32_t _x_shift = 0;
+  int _x_shift = 0;
   int32_t _y_multiplier = 0;
-  int32_t _y_shift = 0;
+  int _y_shift = 0;
 };
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Equal.test.cpp b/compiler/luci-interpreter/src/kernels/Equal.test.cpp
index ba2827ba9..46a0f97d8 100644
--- a/compiler/luci-interpreter/src/kernels/Equal.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Equal.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Equal.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,7 +28,15 @@ namespace
 
 using namespace testing;
 
-TEST(EqualTest, FloatSimple)
+class EqualTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(EqualTest, FloatSimple)
 {
   std::vector<float> x_data{
     0.5, 0.7, 0.9, // Row 1
@@ -44,19 +53,20 @@ TEST(EqualTest, FloatSimple)
     false, true, false, // Row 2
   };
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Equal kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
 }
 
-TEST(EqualTest, FloatBroardcast)
+TEST_F(EqualTest, FloatBroardcast)
 {
   std::vector<float> x_data{
     0.5, 0.7, 0.9, // Row 1
@@ -76,12 +86,13 @@ TEST(EqualTest, FloatBroardcast)
     true,  true,  true,  // Row 4
   };
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({4, 3}, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({4, 3}, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Equal kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
@@ -92,7 +103,7 @@ TEST(EqualTest, FloatBroardcast)
 const float F_MIN = -128.0 / 128.0;
 const float F_MAX = 127.0 / 128.0;
 
-TEST(EqualTest, Uint8Quantized)
+TEST_F(EqualTest, Uint8Quantized)
 {
   std::vector<float> x_data{
     0.5, 0.5, 0.7,  0.9, // Row 1
@@ -110,24 +121,25 @@ TEST(EqualTest, Uint8Quantized)
   };
 
   std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data, _memory_manager.get());
 
   std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 2, F_MAX * 2);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data);
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data, _memory_manager.get());
 
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Equal kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(EqualTest, Uint8QuantizedBroadcast)
+TEST_F(EqualTest, Uint8QuantizedBroadcast)
 {
   std::vector<float> x_data{
     0.4,  -0.8, 0.7,  0.3, // Row 1
@@ -148,34 +160,35 @@ TEST(EqualTest, Uint8QuantizedBroadcast)
   };
 
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 4, 4, 1}, quant_param.first, quant_param.second, x_data);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 4, 4, 1}, quant_param.first, quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 1, 4, 1}, quant_param.first, quant_param.second, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Equal kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 4, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(EqualTest, Input_Type_Mismatch_NEG)
+TEST_F(EqualTest, Input_Type_Mismatch_NEG)
 {
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Equal kernel(&x_tensor, &y_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(EqualTest, Input_Output_Type_NEG)
+TEST_F(EqualTest, Input_Output_Type_NEG)
 {
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Equal kernel(&x_tensor, &y_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/Exp.cpp b/compiler/luci-interpreter/src/kernels/Exp.cpp
index f7b115ab3..e7c560a88 100644
--- a/compiler/luci-interpreter/src/kernels/Exp.cpp
+++ b/compiler/luci-interpreter/src/kernels/Exp.cpp
@@ -19,7 +19,7 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/exp.h>
 
 namespace luci_interpreter
 {
diff --git a/compiler/luci-interpreter/src/kernels/Exp.test.cpp b/compiler/luci-interpreter/src/kernels/Exp.test.cpp
index 19b2c141a..a159d9db9 100644
--- a/compiler/luci-interpreter/src/kernels/Exp.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Exp.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Exp.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -29,13 +30,16 @@ using namespace testing;
 
 TEST(ExpTest, Float)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   Shape input_shape{1, 1, 7};
   std::vector<float> input_data{0.0f, 1.0f, -1.0f, 100.0f, -100.0f, 0.01f, -0.01f};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Exp kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<int32_t> ref_output_shape{1, 1, 7};
diff --git a/compiler/luci-interpreter/src/kernels/Floor.test.cpp b/compiler/luci-interpreter/src/kernels/Floor.test.cpp
index d90d611d9..30076fb54 100644
--- a/compiler/luci-interpreter/src/kernels/Floor.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Floor.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Floor.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -26,7 +27,15 @@ namespace
 
 using namespace testing;
 
-TEST(FloorTest, SimpleFloat)
+class FloorTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(FloorTest, SimpleFloat)
 {
   std::initializer_list<int32_t> input_shape{1, 2, 4, 1};
   std::vector<float> input_data{
@@ -40,20 +49,22 @@ TEST(FloorTest, SimpleFloat)
     3, 7, 10, -1, // Row 2
   };
 
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Floor kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(FloorTest, Input_Output_Type_NEG)
+TEST_F(FloorTest, Input_Output_Type_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S32);
 
   Floor kernel(&input_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/FloorDiv.test.cpp b/compiler/luci-interpreter/src/kernels/FloorDiv.test.cpp
index 16831ca80..3e1b5f18e 100644
--- a/compiler/luci-interpreter/src/kernels/FloorDiv.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/FloorDiv.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/FloorDiv.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,7 +28,15 @@ namespace
 
 using namespace testing;
 
-TEST(FloorDivTest, FloatSimple)
+class FloorDivTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(FloorDivTest, FloatSimple)
 {
   Shape x_shape{2, 3};
   std::vector<float> x_data{
@@ -47,12 +56,13 @@ TEST(FloorDivTest, FloatSimple)
     1, 1, 1, // Row 2
   };
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>(x_shape, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>(y_shape, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>(x_shape, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>(y_shape, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor),
@@ -60,7 +70,7 @@ TEST(FloorDivTest, FloatSimple)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(FloorDivTest, FloatBroadcast)
+TEST_F(FloorDivTest, FloatBroadcast)
 {
   Shape x_shape{1, 3};
   std::vector<float> x_data{
@@ -81,12 +91,13 @@ TEST(FloorDivTest, FloatBroadcast)
     1, 3,  -4, // Row 3
   };
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>(x_shape, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>(y_shape, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>(x_shape, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>(y_shape, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor),
@@ -94,36 +105,37 @@ TEST(FloorDivTest, FloatBroadcast)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(FloorDivTest, DivByZero_NEG)
+TEST_F(FloorDivTest, DivByZero_NEG)
 {
   Shape shape{3};
   std::vector<float> x_data{1, 0, -1};
   std::vector<float> y_data{0, 0, 0};
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>(shape, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>(shape, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>(shape, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>(shape, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
 
   EXPECT_ANY_THROW(kernel.execute());
 }
 
-TEST(FloorDivTest, Input_Output_Type_Mismatch_NEG)
+TEST_F(FloorDivTest, Input_Output_Type_Mismatch_NEG)
 {
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(FloorDivTest, Input_Type_Mismatch_NEG)
+TEST_F(FloorDivTest, Input_Type_Mismatch_NEG)
 {
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1});
-  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   FloorDiv kernel(&x_tensor, &y_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/FullyConnected.cpp b/compiler/luci-interpreter/src/kernels/FullyConnected.cpp
index 48433b42d..cfe8f8bf2 100644
--- a/compiler/luci-interpreter/src/kernels/FullyConnected.cpp
+++ b/compiler/luci-interpreter/src/kernels/FullyConnected.cpp
@@ -19,6 +19,7 @@
 #include "kernels/Utils.h"
 
 #include <tensorflow/lite/kernels/internal/reference/fully_connected.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h>
 
 #include <stdexcept>
 
@@ -48,6 +49,12 @@ void FullyConnected::configure()
     LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32);
     LUCI_INTERPRETER_CHECK(!bias() || bias()->element_type() == DataType::FLOAT32)
   }
+  else if (weights()->element_type() == DataType::S8)
+  {
+    LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::S8);
+    LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::S8);
+    LUCI_INTERPRETER_CHECK(!bias() || bias()->element_type() == DataType::S32)
+  }
   else
   {
     throw std::runtime_error("Unsupported type.");
@@ -77,6 +84,9 @@ void FullyConnected::execute() const
     case DataType::U8:
       evalQuantized();
       break;
+    case DataType::S8:
+      evalQuantizedS8();
+      break;
     case DataType::FLOAT32:
       evalFloat();
       break;
@@ -135,5 +145,38 @@ void FullyConnected::evalQuantized() const
     getTensorShape(output()), getTensorData<uint8_t>(output()));
 }
 
+void FullyConnected::evalQuantizedS8() const
+{
+  double real_multiplier = 0.0;
+  int output_shift;
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  int32_t output_multiplier;
+  real_multiplier =
+    getQuantizedConvolutionMultipler(input()->scale(), weights()->scale(), output()->scale());
+  quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
+  calculateActivationRangeQuantized(params().activation, output(), &output_activation_min,
+                                    &output_activation_max);
+
+  int32_t input_offset = -input()->zero_point();
+  int32_t filter_offset = -weights()->zero_point();
+  int32_t output_offset = output()->zero_point();
+
+  tflite::FullyConnectedParams op_params{};
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  op_params.output_shift = output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  op_params.lhs_cacheable = false;
+  op_params.rhs_cacheable = false;
+  tflite::reference_integer_ops::FullyConnected(
+    op_params, getTensorShape(input()), getTensorData<int8_t>(input()), getTensorShape(weights()),
+    getTensorData<int8_t>(weights()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
+    getTensorShape(output()), getTensorData<int8_t>(output()));
+}
+
 } // namespace kernels
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/FullyConnected.h b/compiler/luci-interpreter/src/kernels/FullyConnected.h
index 204f11ebb..2a7c068c0 100644
--- a/compiler/luci-interpreter/src/kernels/FullyConnected.h
+++ b/compiler/luci-interpreter/src/kernels/FullyConnected.h
@@ -42,6 +42,7 @@ public:
 private:
   void evalFloat() const;
   void evalQuantized() const;
+  void evalQuantizedS8() const;
 };
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp b/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp
index 0259d3e1d..b0eda0145 100644
--- a/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/FullyConnected.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -32,9 +33,13 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
            std::initializer_list<float> input_data, std::initializer_list<float> weights_data,
            std::initializer_list<float> bias_data, std::initializer_list<float> output_data)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor weights_tensor = makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data);
-  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor weights_tensor =
+    makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data, memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   FullyConnectedParams params{};
@@ -42,6 +47,7 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
 
   FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
@@ -49,21 +55,63 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
 }
 
 template <>
+void Check<int8_t>(std::initializer_list<int32_t> input_shape,
+                   std::initializer_list<int32_t> weights_shape,
+                   std::initializer_list<int32_t> bias_shape,
+                   std::initializer_list<int32_t> output_shape,
+                   std::initializer_list<float> input_data,
+                   std::initializer_list<float> weights_data,
+                   std::initializer_list<float> bias_data, std::initializer_list<float> output_data)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  const float quantized_tolerance = getTolerance(-127, 128, 255);
+  std::pair<float, int32_t> input_quant_param = quantizationParams<int8_t>(-63.5, 64);
+  std::pair<float, int32_t> output_quant_param = quantizationParams<int8_t>(-127, 128);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S8>(input_shape, input_quant_param.first, input_quant_param.second,
+                                  input_data, memory_manager.get());
+  Tensor weights_tensor =
+    makeInputTensor<DataType::S8>(weights_shape, input_quant_param.first, input_quant_param.second,
+                                  weights_data, memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::S32>(bias_shape, input_quant_param.first * input_quant_param.first, 0,
+                                   bias_data, memory_manager.get());
+  Tensor output_tensor =
+    makeOutputTensor(DataType::S8, output_quant_param.first, output_quant_param.second);
+
+  FullyConnectedParams params{};
+  params.activation = Activation::RELU;
+
+  FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor, params);
+  kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+  EXPECT_THAT(dequantizeTensorData(output_tensor),
+              FloatArrayNear(output_data, quantized_tolerance));
+}
+
+template <>
 void Check<uint8_t>(
   std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> weights_shape,
   std::initializer_list<int32_t> bias_shape, std::initializer_list<int32_t> output_shape,
   std::initializer_list<float> input_data, std::initializer_list<float> weights_data,
   std::initializer_list<float> bias_data, std::initializer_list<float> output_data)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   const float quantized_tolerance = getTolerance(-127, 128, 255);
   std::pair<float, int32_t> input_quant_param = quantizationParams<uint8_t>(-63.5, 64);
   std::pair<float, int32_t> output_quant_param = quantizationParams<uint8_t>(-127, 128);
-  Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, input_quant_param.first,
-                                                      input_quant_param.second, input_data);
-  Tensor weights_tensor = makeInputTensor<DataType::U8>(weights_shape, input_quant_param.first,
-                                                        input_quant_param.second, weights_data);
-  Tensor bias_tensor = makeInputTensor<DataType::S32>(
-    bias_shape, input_quant_param.first * input_quant_param.first, 0, bias_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>(input_shape, input_quant_param.first, input_quant_param.second,
+                                  input_data, memory_manager.get());
+  Tensor weights_tensor =
+    makeInputTensor<DataType::U8>(weights_shape, input_quant_param.first, input_quant_param.second,
+                                  weights_data, memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::S32>(bias_shape, input_quant_param.first * input_quant_param.first, 0,
+                                   bias_data, memory_manager.get());
   Tensor output_tensor =
     makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
 
@@ -72,6 +120,7 @@ void Check<uint8_t>(
 
   FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
@@ -83,7 +132,7 @@ template <typename T> class FullyConnectedTest : public ::testing::Test
 {
 };
 
-using DataTypes = ::testing::Types<float, uint8_t>;
+using DataTypes = ::testing::Types<float, uint8_t, int8_t>;
 TYPED_TEST_CASE(FullyConnectedTest, DataTypes);
 
 TYPED_TEST(FullyConnectedTest, Simple)
@@ -121,9 +170,13 @@ TEST(FullyConnectedTest, InvalidBiasType_NEG)
   Shape bias_shape{3};
   std::vector<int32_t> bias_data{-1, -5, -8};
 
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor weights_tensor = makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data);
-  Tensor bias_tensor = makeInputTensor<DataType::S32>(bias_shape, bias_data);
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor weights_tensor =
+    makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data, memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S32>(bias_shape, bias_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   FullyConnectedParams params{};
@@ -149,9 +202,14 @@ TEST(FullyConnectedTest, InvalidWeightShapeDim_NEG)
   Shape bias_shape{3};
   std::vector<float> bias_data{-1, -5, -8};
 
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor weights_tensor = makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data);
-  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor weights_tensor =
+    makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data, memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   FullyConnectedParams params{};
@@ -180,9 +238,14 @@ TEST(FullyConnectedTest, BiasElementNumWeightDimMismatch_NEG)
   Shape bias_shape{3};
   std::vector<float> bias_data{-1, -5, -8};
 
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor weights_tensor = makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data);
-  Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data);
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor weights_tensor =
+    makeInputTensor<DataType::FLOAT32>(weights_shape, weights_data, memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   FullyConnectedParams params{};
diff --git a/compiler/luci-interpreter/src/kernels/Greater.h b/compiler/luci-interpreter/src/kernels/Greater.h
index a65d29f5c..877c139c9 100644
--- a/compiler/luci-interpreter/src/kernels/Greater.h
+++ b/compiler/luci-interpreter/src/kernels/Greater.h
@@ -42,9 +42,9 @@ private:
 
 private:
   int32_t _x_multiplier = 0;
-  int32_t _x_shift = 0;
+  int _x_shift = 0;
   int32_t _y_multiplier = 0;
-  int32_t _y_shift = 0;
+  int _y_shift = 0;
 };
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Greater.test.cpp b/compiler/luci-interpreter/src/kernels/Greater.test.cpp
index 3fcc86603..ba3925f17 100644
--- a/compiler/luci-interpreter/src/kernels/Greater.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Greater.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Greater.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,7 +28,15 @@ namespace
 
 using namespace testing;
 
-TEST(GreaterTest, FloatSimple)
+class GreaterTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(GreaterTest, FloatSimple)
 {
   std::vector<float> x_data{
     0.5, 0.7, 0.9, // Row 1
@@ -44,19 +53,20 @@ TEST(GreaterTest, FloatSimple)
     true,  false, false, // Row 2
   };
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Greater kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
 }
 
-TEST(GreaterTest, FloatBroardcast)
+TEST_F(GreaterTest, FloatBroardcast)
 {
   std::vector<float> x_data{
     0.5, 0.7, 0.9, // Row 1
@@ -74,12 +84,13 @@ TEST(GreaterTest, FloatBroardcast)
     false, false, true,  // Row 3
   };
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Greater kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
@@ -90,7 +101,7 @@ TEST(GreaterTest, FloatBroardcast)
 const float F_MIN = -128.0 / 128.0;
 const float F_MAX = 127.0 / 128.0;
 
-TEST(GreaterTest, Uint8Quantized)
+TEST_F(GreaterTest, Uint8Quantized)
 {
   std::vector<float> x_data{
     0.5, 0.6, 0.7,  0.9, // Row 1
@@ -108,21 +119,22 @@ TEST(GreaterTest, Uint8Quantized)
   };
 
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Greater kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(GreaterTest, Uint8QuantizedRescale)
+TEST_F(GreaterTest, Uint8QuantizedRescale)
 {
   std::vector<float> x_data{
     0.5, 0.6, 0.7,  0.9, // Row 1
@@ -142,21 +154,22 @@ TEST(GreaterTest, Uint8QuantizedRescale)
   std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
   std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 2, F_MAX * 3);
 
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Greater kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(GreaterTest, Uint8QuantizedBroadcast)
+TEST_F(GreaterTest, Uint8QuantizedBroadcast)
 {
   std::vector<float> x_data{
     0.4,  -0.8, 0.7,  0.3, // Row 1
@@ -175,34 +188,35 @@ TEST(GreaterTest, Uint8QuantizedBroadcast)
   };
 
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 3, 4, 1}, quant_param.first, quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 1, 4, 1}, quant_param.first, quant_param.second, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Greater kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 3, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(GreaterTest, Input_Type_Mismatch_NEG)
+TEST_F(GreaterTest, Input_Type_Mismatch_NEG)
 {
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Greater kernel(&x_tensor, &y_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(GreaterTest, Input_Output_Type_NEG)
+TEST_F(GreaterTest, Input_Output_Type_NEG)
 {
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Greater kernel(&x_tensor, &y_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/GreaterEqual.h b/compiler/luci-interpreter/src/kernels/GreaterEqual.h
index e948d698f..4a0f48748 100644
--- a/compiler/luci-interpreter/src/kernels/GreaterEqual.h
+++ b/compiler/luci-interpreter/src/kernels/GreaterEqual.h
@@ -42,9 +42,9 @@ private:
 
 private:
   int32_t _x_multiplier = 0;
-  int32_t _x_shift = 0;
+  int _x_shift = 0;
   int32_t _y_multiplier = 0;
-  int32_t _y_shift = 0;
+  int _y_shift = 0;
 };
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp b/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp
index 7c79d8abc..a9d172301 100644
--- a/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/GreaterEqual.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,7 +28,15 @@ namespace
 
 using namespace testing;
 
-TEST(GreaterEqualTest, FloatSimple)
+class GreaterEqualTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(GreaterEqualTest, FloatSimple)
 {
   std::vector<float> x_data{
     0.5, 0.7, 0.9, // Row 1
@@ -44,19 +53,20 @@ TEST(GreaterEqualTest, FloatSimple)
     true,  true, false, // Row 2
   };
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
 }
 
-TEST(GreaterEqualTest, FloatBroardcast)
+TEST_F(GreaterEqualTest, FloatBroardcast)
 {
   std::vector<float> x_data{
     0.5, 0.7, 0.9, // Row 1
@@ -74,12 +84,13 @@ TEST(GreaterEqualTest, FloatBroardcast)
     false, false, true,  // Row 3
   };
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
@@ -90,7 +101,7 @@ TEST(GreaterEqualTest, FloatBroardcast)
 const float F_MIN = -128.0 / 128.0;
 const float F_MAX = 127.0 / 128.0;
 
-TEST(GreaterEqualTest, Uint8Quantized)
+TEST_F(GreaterEqualTest, Uint8Quantized)
 {
   std::vector<float> x_data{
     0.5, 0.6, 0.7,  0.9, // Row 1
@@ -108,21 +119,22 @@ TEST(GreaterEqualTest, Uint8Quantized)
   };
 
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(GreaterEqualTest, Uint8QuantizedRescale)
+TEST_F(GreaterEqualTest, Uint8QuantizedRescale)
 {
   std::vector<float> x_data{
     0.5, 0.5, 0.7,  0.9, // Row 1
@@ -142,21 +154,22 @@ TEST(GreaterEqualTest, Uint8QuantizedRescale)
   std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
   std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 1.2, F_MAX * 1.5);
 
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(GreaterEqualTest, Uint8QuantizedBroadcast)
+TEST_F(GreaterEqualTest, Uint8QuantizedBroadcast)
 {
   std::vector<float> x_data{
     0.4,  -0.8, 0.7,  0.3, // Row 1
@@ -175,34 +188,35 @@ TEST(GreaterEqualTest, Uint8QuantizedBroadcast)
   };
 
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 3, 4, 1}, quant_param.first, quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 1, 4, 1}, quant_param.first, quant_param.second, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 3, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(GreaterEqualTest, Input_Type_Mismatch_NEG)
+TEST_F(GreaterEqualTest, Input_Type_Mismatch_NEG)
 {
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(GreaterEqualTest, Input_Output_Type_NEG)
+TEST_F(GreaterEqualTest, Input_Output_Type_NEG)
 {
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/If.cpp b/compiler/luci-interpreter/src/kernels/If.cpp
index a267f6267..971708bca 100644
--- a/compiler/luci-interpreter/src/kernels/If.cpp
+++ b/compiler/luci-interpreter/src/kernels/If.cpp
@@ -68,6 +68,8 @@ void If::execute() const
 
     const int32_t num_elements = input(i)->shape().num_elements();
     const std::size_t element_size = getDataTypeSize(input(i)->element_type());
+    // TODO: Think about how allocate memory for output in main graph
+    active_graph->configureAllocations(graph_inputs[i]);
     std::memcpy(graph_inputs[i]->data<void>(), input(i)->data<void>(), num_elements * element_size);
   }
 
@@ -78,6 +80,8 @@ void If::execute() const
   {
     LUCI_INTERPRETER_CHECK(graph_outputs[i]->element_type() == output(i)->element_type());
     output(i)->resize(graph_outputs[i]->shape());
+    // TODO: Think about how allocate memory for output in main graph
+    active_graph->configureAllocations(output(i));
 
     const int32_t num_elements = output(i)->shape().num_elements();
     const std::size_t element_size = getDataTypeSize(output(i)->element_type());
diff --git a/compiler/luci-interpreter/src/kernels/If.test.cpp b/compiler/luci-interpreter/src/kernels/If.test.cpp
index 0dba310d9..c5f4faf75 100644
--- a/compiler/luci-interpreter/src/kernels/If.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/If.test.cpp
@@ -21,6 +21,8 @@
 #include "kernels/Mul.h"
 #include "kernels/TestUtils.h"
 
+#include "luci_interpreter/TestMemoryManager.h"
+
 namespace luci_interpreter
 {
 namespace kernels
@@ -30,9 +32,17 @@ namespace
 
 using namespace testing;
 
-RuntimeGraph *buildAddSubgraph(RuntimeModule *module)
+class IfTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+RuntimeGraph *buildAddSubgraph(RuntimeModule *module, IMemoryManager *memory_manager)
 {
-  RuntimeGraph *graph = module->addGraph();
+  RuntimeGraph *graph = module->addGraph(memory_manager);
   Tensor *input1 = graph->addTensor(
     std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
   Tensor *input2 = graph->addTensor(
@@ -40,6 +50,10 @@ RuntimeGraph *buildAddSubgraph(RuntimeModule *module)
   Tensor *output = graph->addTensor(
     std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
 
+  memory_manager->allocate_memory(*input1);
+  memory_manager->allocate_memory(*input2);
+  memory_manager->allocate_memory(*output);
+
   graph->setInputTensors({input1, input2});
   graph->setOutputTensors({output});
 
@@ -50,9 +64,9 @@ RuntimeGraph *buildAddSubgraph(RuntimeModule *module)
   return graph;
 }
 
-RuntimeGraph *buildMulSubgraph(RuntimeModule *module)
+RuntimeGraph *buildMulSubgraph(RuntimeModule *module, IMemoryManager *memory_manager)
 {
-  RuntimeGraph *graph = module->addGraph();
+  RuntimeGraph *graph = module->addGraph(memory_manager);
   Tensor *input1 = graph->addTensor(
     std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
   Tensor *input2 = graph->addTensor(
@@ -60,6 +74,10 @@ RuntimeGraph *buildMulSubgraph(RuntimeModule *module)
   Tensor *output = graph->addTensor(
     std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
 
+  memory_manager->allocate_memory(*input1);
+  memory_manager->allocate_memory(*input2);
+  memory_manager->allocate_memory(*output);
+
   graph->setInputTensors({input1, input2});
   graph->setOutputTensors({output});
 
@@ -70,67 +88,69 @@ RuntimeGraph *buildMulSubgraph(RuntimeModule *module)
   return graph;
 }
 
-TEST(IfTest, CondTrue)
+TEST_F(IfTest, CondTrue)
 {
-  Tensor cond = makeInputTensor<DataType::BOOL>({1}, {true});
-  Tensor input1 = makeInputTensor<DataType::FLOAT32>({2}, {5, 7});
-  Tensor input2 = makeInputTensor<DataType::FLOAT32>({1, 2}, {1, 2});
+  Tensor cond = makeInputTensor<DataType::BOOL>({1}, {true}, _memory_manager.get());
+  Tensor input1 = makeInputTensor<DataType::FLOAT32>({2}, {5, 7}, _memory_manager.get());
+  Tensor input2 = makeInputTensor<DataType::FLOAT32>({1, 2}, {1, 2}, _memory_manager.get());
   Tensor output = makeOutputTensor(DataType::FLOAT32);
 
   RuntimeModule module(nullptr);
-  RuntimeGraph *then_graph = buildAddSubgraph(&module);
-  RuntimeGraph *else_graph = buildMulSubgraph(&module);
+  RuntimeGraph *then_graph = buildAddSubgraph(&module, _memory_manager.get());
+  RuntimeGraph *else_graph = buildMulSubgraph(&module, _memory_manager.get());
 
   If kernel(&cond, {&input1, &input2}, {&output}, then_graph, else_graph);
   kernel.configure();
+  _memory_manager->allocate_memory(output);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output), FloatArrayNear({6, 9}));
 }
 
-TEST(IfTest, CondFalse)
+TEST_F(IfTest, CondFalse)
 {
-  Tensor cond = makeInputTensor<DataType::BOOL>({1}, {false});
-  Tensor input1 = makeInputTensor<DataType::FLOAT32>({2}, {5, 7});
-  Tensor input2 = makeInputTensor<DataType::FLOAT32>({1, 2}, {1, 2});
+  Tensor cond = makeInputTensor<DataType::BOOL>({1}, {false}, _memory_manager.get());
+  Tensor input1 = makeInputTensor<DataType::FLOAT32>({2}, {5, 7}, _memory_manager.get());
+  Tensor input2 = makeInputTensor<DataType::FLOAT32>({1, 2}, {1, 2}, _memory_manager.get());
   Tensor output = makeOutputTensor(DataType::FLOAT32);
 
   RuntimeModule module(nullptr);
-  RuntimeGraph *then_graph = buildAddSubgraph(&module);
-  RuntimeGraph *else_graph = buildMulSubgraph(&module);
+  RuntimeGraph *then_graph = buildAddSubgraph(&module, _memory_manager.get());
+  RuntimeGraph *else_graph = buildMulSubgraph(&module, _memory_manager.get());
 
   If kernel(&cond, {&input1, &input2}, {&output}, then_graph, else_graph);
   kernel.configure();
+  _memory_manager->allocate_memory(output);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output), FloatArrayNear({5, 14}));
 }
 
-TEST(IfTest, InvalidCondType_NEG)
+TEST_F(IfTest, InvalidCondType_NEG)
 {
-  Tensor cond = makeInputTensor<DataType::FLOAT32>({1}, {1});
-  Tensor input1 = makeInputTensor<DataType::FLOAT32>({2}, {5, 7});
-  Tensor input2 = makeInputTensor<DataType::FLOAT32>({1, 2}, {1, 2});
+  Tensor cond = makeInputTensor<DataType::FLOAT32>({1}, {1}, _memory_manager.get());
+  Tensor input1 = makeInputTensor<DataType::FLOAT32>({2}, {5, 7}, _memory_manager.get());
+  Tensor input2 = makeInputTensor<DataType::FLOAT32>({1, 2}, {1, 2}, _memory_manager.get());
   Tensor output = makeOutputTensor(DataType::FLOAT32);
 
   RuntimeModule module(nullptr);
-  RuntimeGraph *then_graph = buildAddSubgraph(&module);
-  RuntimeGraph *else_graph = buildMulSubgraph(&module);
+  RuntimeGraph *then_graph = buildAddSubgraph(&module, _memory_manager.get());
+  RuntimeGraph *else_graph = buildMulSubgraph(&module, _memory_manager.get());
 
   If kernel(&cond, {&input1, &input2}, {&output}, then_graph, else_graph);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(IfTest, InvalidCondElementNum_NEG)
+TEST_F(IfTest, InvalidCondElementNum_NEG)
 {
-  Tensor cond = makeInputTensor<DataType::BOOL>({2}, {false, true});
-  Tensor input1 = makeInputTensor<DataType::FLOAT32>({2}, {5, 7});
-  Tensor input2 = makeInputTensor<DataType::FLOAT32>({1, 2}, {1, 2});
+  Tensor cond = makeInputTensor<DataType::BOOL>({2}, {false, true}, _memory_manager.get());
+  Tensor input1 = makeInputTensor<DataType::FLOAT32>({2}, {5, 7}, _memory_manager.get());
+  Tensor input2 = makeInputTensor<DataType::FLOAT32>({1, 2}, {1, 2}, _memory_manager.get());
   Tensor output = makeOutputTensor(DataType::FLOAT32);
 
   RuntimeModule module(nullptr);
-  RuntimeGraph *then_graph = buildAddSubgraph(&module);
-  RuntimeGraph *else_graph = buildMulSubgraph(&module);
+  RuntimeGraph *then_graph = buildAddSubgraph(&module, _memory_manager.get());
+  RuntimeGraph *else_graph = buildMulSubgraph(&module, _memory_manager.get());
 
   If kernel(&cond, {&input1, &input2}, {&output}, then_graph, else_graph);
   EXPECT_ANY_THROW(kernel.configure());
diff --git a/compiler/luci-interpreter/src/kernels/InstanceNorm.test.cpp b/compiler/luci-interpreter/src/kernels/InstanceNorm.test.cpp
index 1d4ccb4cd..04400c3c0 100644
--- a/compiler/luci-interpreter/src/kernels/InstanceNorm.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/InstanceNorm.test.cpp
@@ -15,6 +15,7 @@
  */
 #include "kernels/InstanceNorm.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -24,11 +25,21 @@ namespace
 {
 
 using namespace testing;
-TEST(InstanceNormTest, Simple)
+
+class InstanceNormTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(InstanceNormTest, Simple)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 2, 2, 1}, {1, 1, 1, 1});
-  Tensor gamma_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1});
-  Tensor beta_tensor = makeInputTensor<DataType::FLOAT32>({1}, {2});
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 2, 1}, {1, 1, 1, 1}, _memory_manager.get());
+  Tensor gamma_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1}, _memory_manager.get());
+  Tensor beta_tensor = makeInputTensor<DataType::FLOAT32>({1}, {2}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   InstanceNormParams params{};
@@ -37,17 +48,19 @@ TEST(InstanceNormTest, Simple)
 
   InstanceNorm kernel(&input_tensor, &gamma_tensor, &beta_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear({2, 2, 2, 2}));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2, 1}));
 }
 
-TEST(InstanceNormTest, Single_gamma_beta)
+TEST_F(InstanceNormTest, Single_gamma_beta)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 2, 1, 2}, {1, 1, 1, 1});
-  Tensor gamma_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1});
-  Tensor beta_tensor = makeInputTensor<DataType::FLOAT32>({1}, {2});
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 1, 2}, {1, 1, 1, 1}, _memory_manager.get());
+  Tensor gamma_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1}, _memory_manager.get());
+  Tensor beta_tensor = makeInputTensor<DataType::FLOAT32>({1}, {2}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   InstanceNormParams params{};
@@ -56,17 +69,19 @@ TEST(InstanceNormTest, Single_gamma_beta)
 
   InstanceNorm kernel(&input_tensor, &gamma_tensor, &beta_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear({2, 2, 2, 2}));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 1, 2}));
 }
 
-TEST(InstanceNormTest, Wrong_gamma_beta_dim_NEG)
+TEST_F(InstanceNormTest, Wrong_gamma_beta_dim_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 2, 1, 2}, {1, 1, 1, 1});
-  Tensor gamma_tensor = makeInputTensor<DataType::FLOAT32>({3}, {1, 1, 1});
-  Tensor beta_tensor = makeInputTensor<DataType::FLOAT32>({3}, {2, 2, 2});
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 1, 2}, {1, 1, 1, 1}, _memory_manager.get());
+  Tensor gamma_tensor = makeInputTensor<DataType::FLOAT32>({3}, {1, 1, 1}, _memory_manager.get());
+  Tensor beta_tensor = makeInputTensor<DataType::FLOAT32>({3}, {2, 2, 2}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   InstanceNormParams params{};
diff --git a/compiler/luci-interpreter/src/kernels/L2Normalize.cpp b/compiler/luci-interpreter/src/kernels/L2Normalize.cpp
index 2eaf5404e..64222953f 100644
--- a/compiler/luci-interpreter/src/kernels/L2Normalize.cpp
+++ b/compiler/luci-interpreter/src/kernels/L2Normalize.cpp
@@ -17,7 +17,7 @@
 #include "kernels/L2Normalize.h"
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALL2Normalize.h"
 
 #include <stdexcept>
 
@@ -66,9 +66,9 @@ template <typename T> void L2Normalize::eval(int32_t zero_point) const
 {
   tflite::L2NormalizationParams op_params{};
   op_params.input_zero_point = zero_point;
-  tflite::optimized_ops::L2Normalization(op_params, getTensorShape(input()),
-                                         getTensorData<T>(input()), getTensorShape(output()),
-                                         getTensorData<T>(output()));
+  luci_interpreter_pal::L2Normalization(op_params, getTensorShape(input()),
+                                        getTensorData<T>(input()), getTensorShape(output()),
+                                        getTensorData<T>(output()));
 }
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
index 6281b451b..1e565e358 100644
--- a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
@@ -16,6 +16,7 @@
  */
 #include "kernels/L2Normalize.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -30,7 +31,9 @@ template <typename T>
 void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
            std::initializer_list<float> input_data, std::initializer_list<float> output_data)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   L2NormParams params{};
@@ -38,6 +41,7 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
 
   L2Normalize kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
@@ -50,12 +54,13 @@ void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
                     std::initializer_list<float> input_data,
                     std::initializer_list<float> output_data)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::pair<float, int32_t> quant_param =
     quantizationParams<uint8_t>(std::min(input_data) < 0 ? std::min(input_data) : 0.f,
                                 std::max(input_data) > 0 ? std::max(input_data) : 0.f);
 
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>(input_shape, quant_param.first, quant_param.second, input_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    input_shape, quant_param.first, quant_param.second, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 128., 128);
 
   L2NormParams params{};
@@ -63,6 +68,7 @@ void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
 
   L2Normalize kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(dequantizeTensorData(output_tensor),
@@ -85,9 +91,11 @@ TYPED_TEST(L2NormalizeTest, Simple)
 
 TEST(L2NormalizeTest, ActivationType_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input_data = {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
 
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   L2NormParams params{};
@@ -99,9 +107,11 @@ TEST(L2NormalizeTest, ActivationType_NEG)
 
 TEST(L2NormalizeTest, InvalidOutputQuantParam_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input_data = {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
 
-  Tensor input_tensor = makeInputTensor<DataType::U8>({1, 1, 1, 6}, 1. / 64., 127, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>({1, 1, 1, 6}, 1. / 64., 127, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 64., 127);
 
   L2NormParams params{};
diff --git a/compiler/luci-interpreter/src/kernels/L2Pool2D.cpp b/compiler/luci-interpreter/src/kernels/L2Pool2D.cpp
index 5bf3ba5a8..5a88808d5 100644
--- a/compiler/luci-interpreter/src/kernels/L2Pool2D.cpp
+++ b/compiler/luci-interpreter/src/kernels/L2Pool2D.cpp
@@ -19,7 +19,7 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALL2Pool2D.h"
 
 #include <stdexcept>
 
@@ -75,9 +75,9 @@ void L2Pool2D::execute() const
       op_params.padding_values.width = _padding_width;
       op_params.float_activation_min = activation_min;
       op_params.float_activation_max = activation_max;
-      tflite::optimized_ops::L2Pool(op_params, getTensorShape(input()),
-                                    getTensorData<float>(input()), getTensorShape(output()),
-                                    getTensorData<float>(output()));
+      luci_interpreter_pal::L2Pool(op_params, getTensorShape(input()),
+                                   getTensorData<float>(input()), getTensorShape(output()),
+                                   getTensorData<float>(output()));
       break;
     default:
       throw std::runtime_error("Unsupported type.");
diff --git a/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp b/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp
index 52f426a08..289742a50 100644
--- a/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/L2Pool2D.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,14 +28,23 @@ namespace
 
 using namespace testing;
 
-TEST(L2Pool2DTest, FloatNone)
+class L2Pool2DTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(L2Pool2DTest, FloatNone)
 {
   Shape input_shape{1, 2, 4, 1};
   std::vector<float> input_data{
     0, 6, 2,  4, //
     3, 2, 10, 7, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pool2DParams params{};
@@ -47,6 +57,7 @@ TEST(L2Pool2DTest, FloatNone)
 
   L2Pool2D kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{3.5, 6.5};
@@ -54,14 +65,15 @@ TEST(L2Pool2DTest, FloatNone)
   // TODO make a Shape checking of output_tensor.
 }
 
-TEST(L2Pool2DTest, FloatRelu)
+TEST_F(L2Pool2DTest, FloatRelu)
 {
   Shape input_shape{1, 2, 4, 1};
   std::vector<float> input_data{
     -1, -6, 2,  4, //
     -3, -2, 10, 7, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pool2DParams params{};
@@ -74,6 +86,7 @@ TEST(L2Pool2DTest, FloatRelu)
 
   L2Pool2D kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{3.53553, 6.5};
@@ -81,14 +94,15 @@ TEST(L2Pool2DTest, FloatRelu)
   // TODO make a Shape checking of output_tensor.
 }
 
-TEST(L2Pool2DTest, FloatRelu1)
+TEST_F(L2Pool2DTest, FloatRelu1)
 {
   Shape input_shape{1, 2, 4, 1};
   std::vector<float> input_data{
     -0.1, -0.6, 2,  4, //
     -0.3, -0.2, 10, 7, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pool2DParams params{};
@@ -101,6 +115,7 @@ TEST(L2Pool2DTest, FloatRelu1)
 
   L2Pool2D kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{0.353553, 1.0};
@@ -108,14 +123,15 @@ TEST(L2Pool2DTest, FloatRelu1)
   // TODO make a Shape checking of output_tensor.
 }
 
-TEST(L2Pool2DTest, FloatRelu6)
+TEST_F(L2Pool2DTest, FloatRelu6)
 {
   Shape input_shape{1, 2, 4, 1};
   std::vector<float> input_data{
     -0.1, -0.6, 2,  4, //
     -0.3, -0.2, 10, 7, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pool2DParams params{};
@@ -128,6 +144,7 @@ TEST(L2Pool2DTest, FloatRelu6)
 
   L2Pool2D kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{0.353553, 6.0};
@@ -135,14 +152,15 @@ TEST(L2Pool2DTest, FloatRelu6)
   // TODO make a Shape checking of output_tensor.
 }
 
-TEST(L2Pool2DTest, FloatPaddingSame)
+TEST_F(L2Pool2DTest, FloatPaddingSame)
 {
   Shape input_shape{1, 2, 4, 1};
   std::vector<float> input_data{
     0, 6, 2,  4, //
     3, 2, 10, 7, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pool2DParams params{};
@@ -155,6 +173,7 @@ TEST(L2Pool2DTest, FloatPaddingSame)
 
   L2Pool2D kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{3.5, 6.5};
@@ -162,14 +181,15 @@ TEST(L2Pool2DTest, FloatPaddingSame)
   // TODO make a Shape checking of output_tensor.
 }
 
-TEST(L2Pool2DTest, FloatPaddingSameStride)
+TEST_F(L2Pool2DTest, FloatPaddingSameStride)
 {
   Shape input_shape{1, 2, 4, 1};
   std::vector<float> input_data{
     0, 6, 2,  4, //
     3, 2, 10, 7, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pool2DParams params{};
@@ -182,6 +202,7 @@ TEST(L2Pool2DTest, FloatPaddingSameStride)
 
   L2Pool2D kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{3.5, 6.0, 6.5, 5.70088, 2.54951, 7.2111, 8.63134, 7.0};
@@ -189,14 +210,15 @@ TEST(L2Pool2DTest, FloatPaddingSameStride)
   // TODO make a Shape checking of output_tensor.
 }
 
-TEST(L2Pool2DTest, FloatPaddingValidStride)
+TEST_F(L2Pool2DTest, FloatPaddingValidStride)
 {
   Shape input_shape{1, 2, 4, 1};
   std::vector<float> input_data{
     0, 6, 2,  4, //
     3, 2, 10, 7, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pool2DParams params{};
@@ -209,6 +231,7 @@ TEST(L2Pool2DTest, FloatPaddingValidStride)
 
   L2Pool2D kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{3.5, 6.0, 6.5};
@@ -216,14 +239,15 @@ TEST(L2Pool2DTest, FloatPaddingValidStride)
   // TODO make a Shape checking of output_tensor.
 }
 
-TEST(L2Pool2DTest, InvalidInputShape_NEG)
+TEST_F(L2Pool2DTest, InvalidInputShape_NEG)
 {
   Shape input_shape{1, 2, 4};
   std::vector<float> input_data{
     0, 6, 2,  4, //
     3, 2, 10, 7, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pool2DParams params{};
@@ -238,14 +262,15 @@ TEST(L2Pool2DTest, InvalidInputShape_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(L2Pool2DTest, InvalidInputOutputType_NEG)
+TEST_F(L2Pool2DTest, InvalidInputOutputType_NEG)
 {
   Shape input_shape{1, 2, 4};
   std::vector<float> input_data{
     0, 6, 2,  4, //
     3, 2, 10, 7, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   Pool2DParams params{};
diff --git a/compiler/luci-interpreter/src/kernels/LeakyRelu.cpp b/compiler/luci-interpreter/src/kernels/LeakyRelu.cpp
index f468da5d3..3833a55e8 100644
--- a/compiler/luci-interpreter/src/kernels/LeakyRelu.cpp
+++ b/compiler/luci-interpreter/src/kernels/LeakyRelu.cpp
@@ -18,8 +18,9 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/leaky_relu.h>
+
+#include "PALLeakyRelu.h"
 
 #include <stdexcept>
 
@@ -66,9 +67,8 @@ void LeakyRelu::evalFloat() const
 {
   tflite::LeakyReluParams op_params{};
   op_params.alpha = params().alpha;
-  tflite::optimized_ops::LeakyRelu(op_params, getTensorShape(input()),
-                                   getTensorData<float>(input()), getTensorShape(output()),
-                                   getTensorData<float>(output()));
+  luci_interpreter_pal::LeakyRelu(op_params, getTensorShape(input()), getTensorData<float>(input()),
+                                  getTensorShape(output()), getTensorData<float>(output()));
 }
 
 void LeakyRelu::evalQuantized() const
diff --git a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
index b5cc3e7fc..6ec8a348a 100644
--- a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/LeakyRelu.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -31,8 +32,10 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
            std::initializer_list<float> input_data, std::initializer_list<float> output_data,
            float alpha)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   constexpr DataType element_type = getElementType<T>();
-  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(element_type);
 
   LeakyReluParams params{};
@@ -41,6 +44,7 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
   LeakyRelu kernel(&input_tensor, &output_tensor, params);
 
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
@@ -53,10 +57,11 @@ void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
                     std::initializer_list<float> input_data,
                     std::initializer_list<float> output_data, float alpha)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   const float quantized_tolerance = getTolerance(-8, 127.f / 16.f, 255);
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-8, 127.f / 16.f);
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>(input_shape, quant_param.first, quant_param.second, input_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    input_shape, quant_param.first, quant_param.second, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
 
   LeakyReluParams params{};
@@ -65,6 +70,7 @@ void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
   LeakyRelu kernel(&input_tensor, &output_tensor, params);
 
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
@@ -99,10 +105,13 @@ TYPED_TEST(LeakReluTest, Simple)
 
 TEST(LeakReluTest, IvalidInputOutputType_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, {
-                                                                     0.0f, 1.0f, 3.0f,   // Row 1
-                                                                     1.0f, -1.0f, -2.0f, // Row 2
-                                                                   });
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 3},
+                                                           {
+                                                             0.0f, 1.0f, 3.0f,   // Row 1
+                                                             1.0f, -1.0f, -2.0f, // Row 2
+                                                           },
+                                                           memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   LeakyReluParams params{};
diff --git a/compiler/luci-interpreter/src/kernels/Less.h b/compiler/luci-interpreter/src/kernels/Less.h
index fe03e10b1..293740e72 100644
--- a/compiler/luci-interpreter/src/kernels/Less.h
+++ b/compiler/luci-interpreter/src/kernels/Less.h
@@ -42,9 +42,9 @@ private:
 
 private:
   int32_t _x_multiplier = 0;
-  int32_t _x_shift = 0;
+  int _x_shift = 0;
   int32_t _y_multiplier = 0;
-  int32_t _y_shift = 0;
+  int _y_shift = 0;
 };
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Less.test.cpp b/compiler/luci-interpreter/src/kernels/Less.test.cpp
index 2972bd559..e9d09b288 100644
--- a/compiler/luci-interpreter/src/kernels/Less.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Less.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Less.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,7 +28,15 @@ namespace
 
 using namespace testing;
 
-TEST(LessTest, FloatSimple)
+class LessTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(LessTest, FloatSimple)
 {
   std::vector<float> x_data{
     0.5, 0.7, 0.9, // Row 1
@@ -44,19 +53,20 @@ TEST(LessTest, FloatSimple)
     false, false, true,  // Row 2
   };
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Less kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
 }
 
-TEST(LessTest, FloatBroardcast)
+TEST_F(LessTest, FloatBroardcast)
 {
   std::vector<float> x_data{
     0.5, 0.7, 0.9, // Row 1
@@ -74,12 +84,13 @@ TEST(LessTest, FloatBroardcast)
     true,  true,  false, // Row 3
   };
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Less kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
@@ -90,7 +101,7 @@ TEST(LessTest, FloatBroardcast)
 const float F_MIN = -128.0 / 128.0;
 const float F_MAX = 127.0 / 128.0;
 
-TEST(LessTest, Uint8Quantized)
+TEST_F(LessTest, Uint8Quantized)
 {
   std::vector<float> x_data{
     0.5, 0.6, 0.7,  0.9, // Row 1
@@ -108,21 +119,22 @@ TEST(LessTest, Uint8Quantized)
   };
 
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Less kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(LessTest, Uint8QuantizedRescale)
+TEST_F(LessTest, Uint8QuantizedRescale)
 {
   std::vector<float> x_data{
     0.5, 0.6, 0.7,  0.9, // Row 1
@@ -142,21 +154,22 @@ TEST(LessTest, Uint8QuantizedRescale)
   std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
   std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 1.2, F_MAX * 1.5);
 
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Less kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(LessTest, Uint8QuantizedBroadcast)
+TEST_F(LessTest, Uint8QuantizedBroadcast)
 {
   std::vector<float> x_data{
     0.4,  -0.8, 0.7,  0.3, // Row 1
@@ -175,34 +188,35 @@ TEST(LessTest, Uint8QuantizedBroadcast)
   };
 
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 3, 4, 1}, quant_param.first, quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 1, 4, 1}, quant_param.first, quant_param.second, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Less kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 3, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(LessTest, Input_Type_Mismatch_NEG)
+TEST_F(LessTest, Input_Type_Mismatch_NEG)
 {
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Less kernel(&x_tensor, &y_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(LessTest, Input_Output_Type_NEG)
+TEST_F(LessTest, Input_Output_Type_NEG)
 {
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Less kernel(&x_tensor, &y_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/LessEqual.h b/compiler/luci-interpreter/src/kernels/LessEqual.h
index ed4b0f1ea..b6da1a2a8 100644
--- a/compiler/luci-interpreter/src/kernels/LessEqual.h
+++ b/compiler/luci-interpreter/src/kernels/LessEqual.h
@@ -42,9 +42,9 @@ private:
 
 private:
   int32_t _x_multiplier = 0;
-  int32_t _x_shift = 0;
+  int _x_shift = 0;
   int32_t _y_multiplier = 0;
-  int32_t _y_shift = 0;
+  int _y_shift = 0;
 };
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp b/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp
index db65815a6..0558003dd 100644
--- a/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/LessEqual.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,7 +28,15 @@ namespace
 
 using namespace testing;
 
-TEST(LessEqualTest, FloatSimple)
+class LessEqualTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(LessEqualTest, FloatSimple)
 {
   std::vector<float> x_data{
     0.5, 0.7, 0.9, // Row 1
@@ -44,19 +53,20 @@ TEST(LessEqualTest, FloatSimple)
     false, true, true,  // Row 2
   };
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
 }
 
-TEST(LessEqualTest, FloatBroardcast)
+TEST_F(LessEqualTest, FloatBroardcast)
 {
   std::vector<float> x_data{
     0.5, 0.7, 0.9, // Row 1
@@ -74,12 +84,13 @@ TEST(LessEqualTest, FloatBroardcast)
     true,  true, false, // Row 3
   };
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
@@ -90,7 +101,7 @@ TEST(LessEqualTest, FloatBroardcast)
 const float F_MIN = -128.0 / 128.0;
 const float F_MAX = 127.0 / 128.0;
 
-TEST(LessEqualTest, Uint8Quantized)
+TEST_F(LessEqualTest, Uint8Quantized)
 {
   std::vector<float> x_data{
     0.5, 0.6, 0.7,  0.9, // Row 1
@@ -108,21 +119,22 @@ TEST(LessEqualTest, Uint8Quantized)
   };
 
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(LessEqualTest, Uint8QuantizedRescale)
+TEST_F(LessEqualTest, Uint8QuantizedRescale)
 {
   std::vector<float> x_data{
     0.5, 0.6, 0.7,  0.9, // Row 1
@@ -142,21 +154,22 @@ TEST(LessEqualTest, Uint8QuantizedRescale)
   std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
   std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 1.2, F_MAX * 1.5);
 
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(LessEqualTest, Uint8QuantizedBroadcast)
+TEST_F(LessEqualTest, Uint8QuantizedBroadcast)
 {
   std::vector<float> x_data{
     0.4,  -0.8, 0.7,  0.3, // Row 1
@@ -175,34 +188,35 @@ TEST(LessEqualTest, Uint8QuantizedBroadcast)
   };
 
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 3, 4, 1}, quant_param.first, quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 1, 4, 1}, quant_param.first, quant_param.second, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 3, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(LessEqualTest, Input_Type_Mismatch_NEG)
+TEST_F(LessEqualTest, Input_Type_Mismatch_NEG)
 {
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(LessEqualTest, Input_Output_Type_NEG)
+TEST_F(LessEqualTest, Input_Output_Type_NEG)
 {
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.cpp b/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.cpp
index fd2ec41a1..a2bf442b0 100644
--- a/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.cpp
+++ b/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.cpp
@@ -18,7 +18,7 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALLocalResponseNormalization.h"
 
 #include <stdexcept>
 
@@ -52,7 +52,7 @@ void LocalResponseNormalization::execute() const
       op_params.bias = params().bias;
       op_params.alpha = params().alpha;
       op_params.beta = params().beta;
-      tflite::optimized_ops::LocalResponseNormalization(
+      luci_interpreter_pal::LocalResponseNormalization(
         op_params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(output()),
         getTensorData<float>(output()));
       break;
diff --git a/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp b/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp
index 6a4331d34..4a9d4739f 100644
--- a/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/LocalResponseNormalization.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,10 +28,18 @@ namespace
 
 using namespace testing;
 
-TEST(LocalResponseNormalizationTest, SameAsL2Norm)
+class LocalResponseNormalizationTest : public ::testing::Test
 {
-  Tensor input_tensor =
-    makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(LocalResponseNormalizationTest, SameAsL2Norm)
+{
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(
+    {1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   LocalResponseNormalizationParams params{};
@@ -41,16 +50,17 @@ TEST(LocalResponseNormalizationTest, SameAsL2Norm)
 
   LocalResponseNormalization kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor),
               FloatArrayNear({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}));
 }
 
-TEST(LocalResponseNormalizationTest, WithAlpha)
+TEST_F(LocalResponseNormalizationTest, WithAlpha)
 {
-  Tensor input_tensor =
-    makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(
+    {1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   LocalResponseNormalizationParams params{};
@@ -61,16 +71,17 @@ TEST(LocalResponseNormalizationTest, WithAlpha)
 
   LocalResponseNormalization kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor),
               FloatArrayNear({-0.275, 0.15, 0.175, 0.3, -0.175, 0.025}));
 }
 
-TEST(LocalResponseNormalizationTest, WithBias)
+TEST_F(LocalResponseNormalizationTest, WithBias)
 {
-  Tensor input_tensor =
-    makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(
+    {1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   LocalResponseNormalizationParams params{};
@@ -81,16 +92,17 @@ TEST(LocalResponseNormalizationTest, WithBias)
 
   LocalResponseNormalization kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor),
               FloatArrayNear({-0.22, 0.12, 0.14, 0.24, -0.14, 0.02}));
 }
 
-TEST(LocalResponseNormalizationTest, SmallRadius)
+TEST_F(LocalResponseNormalizationTest, SmallRadius)
 {
-  Tensor input_tensor =
-    makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(
+    {1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   LocalResponseNormalizationParams params{};
@@ -101,16 +113,17 @@ TEST(LocalResponseNormalizationTest, SmallRadius)
 
   LocalResponseNormalization kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor),
               FloatArrayNear({-0.264926, 0.125109, 0.140112, 0.267261, -0.161788, 0.0244266}));
 }
 
-TEST(LocalResponseNormalizationTest, InvalidInputDimension_NEG)
+TEST_F(LocalResponseNormalizationTest, InvalidInputDimension_NEG)
 {
-  Tensor input_tensor =
-    makeInputTensor<DataType::FLOAT32>({1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(
+    {1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   LocalResponseNormalizationParams params{};
@@ -123,10 +136,10 @@ TEST(LocalResponseNormalizationTest, InvalidInputDimension_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(LocalResponseNormalizationTest, InvalidInputOutputType_NEG)
+TEST_F(LocalResponseNormalizationTest, InvalidInputOutputType_NEG)
 {
-  Tensor input_tensor =
-    makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(
+    {1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   LocalResponseNormalizationParams params{};
diff --git a/compiler/luci-interpreter/src/kernels/LogSoftmax.cpp b/compiler/luci-interpreter/src/kernels/LogSoftmax.cpp
index 03d13e4ce..79c315338 100644
--- a/compiler/luci-interpreter/src/kernels/LogSoftmax.cpp
+++ b/compiler/luci-interpreter/src/kernels/LogSoftmax.cpp
@@ -18,9 +18,9 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/log_softmax.h>
 
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALLogSoftmax.h"
 
 namespace luci_interpreter
 {
@@ -41,8 +41,7 @@ void LogSoftmax::configure()
 
     params.table = _table;
     params.beta = 1.0;
-
-    tflite::optimized_ops::PopulateSoftmaxLookupTable(&params, input()->scale(), params.beta);
+    luci_interpreter_pal::PopulateSoftmaxLookupTable(&params, input()->scale(), params.beta);
   }
   output()->resize(input()->shape());
 }
@@ -76,6 +75,7 @@ void LogSoftmax::evalQuantized() const
   const auto input_scale = input()->scale();
   uint8_t *output_data = getTensorData<uint8_t>(output());
   const uint8_t *input_data = getTensorData<uint8_t>(input());
+  const float beta = 1.0;
 
   tflite::SoftmaxParams params{};
 
@@ -83,8 +83,9 @@ void LogSoftmax::evalQuantized() const
   params.zero_point = output()->zero_point();
   params.scale = output()->scale();
 
-  tflite::optimized_ops::LogSoftmax(params, input_scale, input_shape, input_data, output_shape,
-                                    output_data);
+  luci_interpreter_pal::InitializeParams(&params, input_scale, beta);
+  luci_interpreter_pal::LogSoftmax(params, input_scale, input_shape, input_data, output_shape,
+                                   output_data);
 }
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/LogSoftmax.test.cpp b/compiler/luci-interpreter/src/kernels/LogSoftmax.test.cpp
index 8a90c1dd0..50dcd5c28 100644
--- a/compiler/luci-interpreter/src/kernels/LogSoftmax.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LogSoftmax.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/LogSoftmax.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,18 +28,28 @@ namespace
 
 using namespace testing;
 
-TEST(LogSoftmaxTest, Float)
+class LogSoftmaxTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(LogSoftmaxTest, Float)
 {
   Shape input_shape{2, 4};
   std::vector<float> input_data{
     0, -6, 2,  4, //
     3, -2, 10, 1, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   LogSoftmax kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{
@@ -48,7 +59,7 @@ TEST(LogSoftmaxTest, Float)
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
 }
 
-TEST(LogSoftmaxTest, Uint8)
+TEST_F(LogSoftmaxTest, Uint8)
 {
   float kMin = -10;
   float kMax = 10;
@@ -58,12 +69,13 @@ TEST(LogSoftmaxTest, Uint8)
     0, -6, 2,  4, //
     3, -2, 10, 1, //
   };
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({2, 4}, quant_param.first, quant_param.second, input_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>({2, 4}, quant_param.first, quant_param.second,
+                                                      input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, 16. / 256, 255);
 
   LogSoftmax kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{
@@ -78,28 +90,29 @@ TEST(LogSoftmaxTest, Uint8)
               ::testing::ElementsAreArray({189, 93, 221, 253, 142, 63, 255, 111}));
 }
 
-TEST(LogSoftmaxTest, InvalidInputOutputType_NEG)
+TEST_F(LogSoftmaxTest, InvalidInputOutputType_NEG)
 {
   std::vector<float> input_data{
     0, -6, 2,  4, //
     3, -2, 10, 1, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 4}, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 4}, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, 16. / 256, 255);
 
   LogSoftmax kernel(&input_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(LogSoftmaxTest, InvalidOutputQuantParam_NEG)
+TEST_F(LogSoftmaxTest, InvalidOutputQuantParam_NEG)
 {
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-10, 10);
   std::vector<float> input_data{
     0, -6, 2,  4, //
     3, -2, 10, 1, //
   };
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({2, 4}, quant_param.first, quant_param.second, input_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>({2, 4}, quant_param.first, quant_param.second,
+                                                      input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, 20. / 256, 255);
 
   LogSoftmax kernel(&input_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/LogicalAnd.test.cpp b/compiler/luci-interpreter/src/kernels/LogicalAnd.test.cpp
index 564f191d5..21b7951e0 100644
--- a/compiler/luci-interpreter/src/kernels/LogicalAnd.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LogicalAnd.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/LogicalAnd.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,15 +28,26 @@ namespace
 
 using namespace testing;
 
-TEST(LogicalAndTest, Basic)
+class LogicalAndTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(LogicalAndTest, Basic)
 {
   Shape input_shape{1, 1, 1, 4};
-  Tensor input_tensor1 = makeInputTensor<DataType::BOOL>(input_shape, {true, false, false, true});
-  Tensor input_tensor2 = makeInputTensor<DataType::BOOL>(input_shape, {true, false, true, false});
+  Tensor input_tensor1 =
+    makeInputTensor<DataType::BOOL>(input_shape, {true, false, false, true}, _memory_manager.get());
+  Tensor input_tensor2 =
+    makeInputTensor<DataType::BOOL>(input_shape, {true, false, true, false}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   LogicalAnd kernel(&input_tensor1, &input_tensor2, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor),
@@ -43,14 +55,17 @@ TEST(LogicalAndTest, Basic)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAre(1, 1, 1, 4));
 }
 
-TEST(LogicalAndTest, Broadcast)
+TEST_F(LogicalAndTest, Broadcast)
 {
-  Tensor input_tensor1 = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, false, true});
-  Tensor input_tensor2 = makeInputTensor<DataType::BOOL>({1, 1, 1, 1}, {true});
+  Tensor input_tensor1 = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, false, true},
+                                                         _memory_manager.get());
+  Tensor input_tensor2 =
+    makeInputTensor<DataType::BOOL>({1, 1, 1, 1}, {true}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   LogicalAnd kernel(&input_tensor1, &input_tensor2, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor),
@@ -58,20 +73,23 @@ TEST(LogicalAndTest, Broadcast)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAre(1, 1, 1, 4));
 }
 
-TEST(LogicalAndTest, MismatchInputType_NEG)
+TEST_F(LogicalAndTest, MismatchInputType_NEG)
 {
-  Tensor input1_tensor = makeInputTensor<DataType::S32>({1, 1, 1, 4}, {1, 0, 0, 1});
-  Tensor input2_tensor = makeInputTensor<DataType::BOOL>({1, 1, 1, 1}, {false});
+  Tensor input1_tensor =
+    makeInputTensor<DataType::S32>({1, 1, 1, 4}, {1, 0, 0, 1}, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::BOOL>({1, 1, 1, 1}, {false}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S32);
 
   LogicalAnd kernel(&input1_tensor, &input2_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(LogicalAndTest, InputTypeInvalid_NEG)
+TEST_F(LogicalAndTest, InputTypeInvalid_NEG)
 {
-  Tensor input1_tensor = makeInputTensor<DataType::S32>({1, 1, 1, 4}, {1, 0, 0, 1});
-  Tensor input2_tensor = makeInputTensor<DataType::S32>({1, 1, 1, 1}, {0});
+  Tensor input1_tensor =
+    makeInputTensor<DataType::S32>({1, 1, 1, 4}, {1, 0, 0, 1}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S32>({1, 1, 1, 1}, {0}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   LogicalAnd kernel(&input1_tensor, &input2_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/LogicalNot.test.cpp b/compiler/luci-interpreter/src/kernels/LogicalNot.test.cpp
index dccb81102..3cbf27f6b 100644
--- a/compiler/luci-interpreter/src/kernels/LogicalNot.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LogicalNot.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/LogicalNot.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,14 +28,24 @@ namespace
 
 using namespace testing;
 
-TEST(LogicalNotTest, Basic)
+class LogicalNotTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(LogicalNotTest, Basic)
 {
   Shape input_shape{1, 1, 1, 4};
-  Tensor input_tensor = makeInputTensor<DataType::BOOL>(input_shape, {true, false, false, true});
+  Tensor input_tensor =
+    makeInputTensor<DataType::BOOL>(input_shape, {true, false, false, true}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   LogicalNot kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor),
@@ -42,18 +53,20 @@ TEST(LogicalNotTest, Basic)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAre(1, 1, 1, 4));
 }
 
-TEST(LogicalNotTest, OutputTypeInvalid_NEG)
+TEST_F(LogicalNotTest, OutputTypeInvalid_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, false, true});
+  Tensor input_tensor = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, false, true},
+                                                        _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S32);
 
   LogicalNot kernel(&input_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(LogicalNotTest, InputTypeInvalid_NEG)
+TEST_F(LogicalNotTest, InputTypeInvalid_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::S32>({1, 1, 1, 4}, {1, 0, 0, 1});
+  Tensor input_tensor =
+    makeInputTensor<DataType::S32>({1, 1, 1, 4}, {1, 0, 0, 1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   LogicalNot kernel(&input_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/LogicalOr.cpp b/compiler/luci-interpreter/src/kernels/LogicalOr.cpp
index 7027a2a8b..f289ca64f 100644
--- a/compiler/luci-interpreter/src/kernels/LogicalOr.cpp
+++ b/compiler/luci-interpreter/src/kernels/LogicalOr.cpp
@@ -20,8 +20,6 @@
 #include "kernels/Utils.h"
 #include "kernels/BinaryOpCommon.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
-
 namespace luci_interpreter
 {
 namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/LogicalOr.test.cpp b/compiler/luci-interpreter/src/kernels/LogicalOr.test.cpp
index 677eac96a..d65a69a5e 100644
--- a/compiler/luci-interpreter/src/kernels/LogicalOr.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LogicalOr.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/LogicalOr.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,15 +28,26 @@ namespace
 
 using namespace testing;
 
-TEST(LogicalOrTest, Basic)
+class LogicalOrTest : public ::testing::Test
 {
-  Tensor input1_tensor = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, false, true});
-  Tensor input2_tensor = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, true, false});
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(LogicalOrTest, Basic)
+{
+  Tensor input1_tensor = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, false, true},
+                                                         _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, true, false},
+                                                         _memory_manager.get());
 
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   LogicalOr kernel(&input1_tensor, &input2_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor),
@@ -43,15 +55,18 @@ TEST(LogicalOrTest, Basic)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAre(1, 1, 1, 4));
 }
 
-TEST(LogicalOrTest, Broadcast)
+TEST_F(LogicalOrTest, Broadcast)
 {
-  Tensor input1_tensor = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, false, true});
-  Tensor input2_tensor = makeInputTensor<DataType::BOOL>({1, 1, 1, 1}, {false});
+  Tensor input1_tensor = makeInputTensor<DataType::BOOL>({1, 1, 1, 4}, {true, false, false, true},
+                                                         _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::BOOL>({1, 1, 1, 1}, {false}, _memory_manager.get());
 
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   LogicalOr kernel(&input1_tensor, &input2_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor),
@@ -59,10 +74,12 @@ TEST(LogicalOrTest, Broadcast)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAre(1, 1, 1, 4));
 }
 
-TEST(LogicalOrTest, MismatchInputType_NEG)
+TEST_F(LogicalOrTest, MismatchInputType_NEG)
 {
-  Tensor input1_tensor = makeInputTensor<DataType::S32>({1, 1, 1, 4}, {1, 0, 0, 1});
-  Tensor input2_tensor = makeInputTensor<DataType::BOOL>({1, 1, 1, 1}, {false});
+  Tensor input1_tensor =
+    makeInputTensor<DataType::S32>({1, 1, 1, 4}, {1, 0, 0, 1}, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::BOOL>({1, 1, 1, 1}, {false}, _memory_manager.get());
 
   Tensor output_tensor = makeOutputTensor(DataType::S32);
 
@@ -70,10 +87,11 @@ TEST(LogicalOrTest, MismatchInputType_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(LogicalOrTest, InputTypeInvalid_NEG)
+TEST_F(LogicalOrTest, InputTypeInvalid_NEG)
 {
-  Tensor input1_tensor = makeInputTensor<DataType::S32>({1, 1, 1, 4}, {1, 0, 0, 1});
-  Tensor input2_tensor = makeInputTensor<DataType::S32>({1, 1, 1, 1}, {0});
+  Tensor input1_tensor =
+    makeInputTensor<DataType::S32>({1, 1, 1, 4}, {1, 0, 0, 1}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S32>({1, 1, 1, 1}, {0}, _memory_manager.get());
 
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
diff --git a/compiler/luci-interpreter/src/kernels/Logistic.cpp b/compiler/luci-interpreter/src/kernels/Logistic.cpp
index 97d7bf13d..58e4f185d 100644
--- a/compiler/luci-interpreter/src/kernels/Logistic.cpp
+++ b/compiler/luci-interpreter/src/kernels/Logistic.cpp
@@ -18,7 +18,7 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/logistic.h>
 
 namespace luci_interpreter
 {
diff --git a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
index 41369a417..70227563f 100644
--- a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Logistic.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -30,11 +31,15 @@ template <typename T>
 void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
            std::initializer_list<float> input_data, std::initializer_list<float> output_data)
 {
-  Tensor input_tensor = makeInputTensor<getElementType<T>()>(input_shape, input_data);
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor =
+    makeInputTensor<getElementType<T>()>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(getElementType<T>());
 
   Logistic kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
@@ -47,14 +52,18 @@ void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
                     std::initializer_list<float> input_data,
                     std::initializer_list<float> output_data)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   std::pair<float, int32_t> input_quant_param =
     quantizationParams<uint8_t>(std::min(input_data), std::max(input_data));
-  Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, input_quant_param.first,
-                                                      input_quant_param.second, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>(input_shape, input_quant_param.first, input_quant_param.second,
+                                  input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 256, 0);
 
   Logistic kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(dequantizeTensorData(output_tensor),
@@ -107,9 +116,12 @@ TYPED_TEST(LogisticTest, Simple)
 
 TEST(LogisticTest, IvalidInputOutputType_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   Shape input_shape = {1};
   std::vector<float> input_data{10};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 256, 0);
 
   Logistic kernel(&input_tensor, &output_tensor);
@@ -118,11 +130,13 @@ TEST(LogisticTest, IvalidInputOutputType_NEG)
 
 TEST(LogisticTest, IvalidQuantParam_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   Shape input_shape = {2};
   std::vector<float> input_data{-10, 10};
   std::pair<float, int32_t> input_quant_param = quantizationParams<uint8_t>(-10, 10);
-  Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, input_quant_param.first,
-                                                      input_quant_param.second, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>(input_shape, input_quant_param.first, input_quant_param.second,
+                                  input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 255, 0);
 
   Logistic kernel(&input_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/MaxPool2D.test.cpp b/compiler/luci-interpreter/src/kernels/MaxPool2D.test.cpp
index b9991f7ec..44f2a222f 100644
--- a/compiler/luci-interpreter/src/kernels/MaxPool2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/MaxPool2D.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/MaxPool2D.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -26,7 +27,15 @@ namespace
 
 using namespace testing;
 
-TEST(MaxPool2DTest, Float)
+class MaxPool2DTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(MaxPool2DTest, Float)
 {
   Shape input_shape{1, 3, 5, 1};
   std::vector<float> input_data{
@@ -34,7 +43,8 @@ TEST(MaxPool2DTest, Float)
     -7, -6, -5, -4, -3, //
     5,  4,  3,  6,  7,  //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pool2DParams params{};
@@ -47,6 +57,7 @@ TEST(MaxPool2DTest, Float)
 
   MaxPool2D kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{
@@ -58,15 +69,15 @@ TEST(MaxPool2DTest, Float)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(MaxPool2DTest, Uint8)
+TEST_F(MaxPool2DTest, Uint8)
 {
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-15.9375, 15.9375);
   std::vector<float> input_data{
     0,  -6, 12, 4, //
     -3, -2, 10, 7, //
   };
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, input_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
 
   Pool2DParams params{};
@@ -79,6 +90,7 @@ TEST(MaxPool2DTest, Uint8)
 
   MaxPool2D kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{0.0, 6.0};
@@ -87,7 +99,7 @@ TEST(MaxPool2DTest, Uint8)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(MaxPool2DTest, SInt16)
+TEST_F(MaxPool2DTest, SInt16)
 {
   Shape input_shape{1, 3, 5, 1};
   std::vector<int32_t> ref_output_shape{1, 2, 2, 1};
@@ -101,7 +113,8 @@ TEST(MaxPool2DTest, SInt16)
     5, 6, //
   };
 
-  Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, 0.2, 0, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>(input_shape, 0.2, 0, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S16, 0.2, 0);
 
   Pool2DParams params{};
@@ -114,6 +127,7 @@ TEST(MaxPool2DTest, SInt16)
 
   MaxPool2D kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
diff --git a/compiler/luci-interpreter/src/kernels/Maximum.test.cpp b/compiler/luci-interpreter/src/kernels/Maximum.test.cpp
index 2ddaeaf04..e4a505b03 100644
--- a/compiler/luci-interpreter/src/kernels/Maximum.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Maximum.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Maximum.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,34 +28,48 @@ namespace
 
 using namespace testing;
 
-TEST(MaximumTest, Float)
+class MaximumTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(MaximumTest, Float)
 {
   Shape input_shape{3, 1, 2};
   std::vector<float> input_data1{1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
   std::vector<float> input_data2{-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
-  Tensor input_tensor1 = makeInputTensor<DataType::FLOAT32>(input_shape, input_data1);
-  Tensor input_tensor2 = makeInputTensor<DataType::FLOAT32>(input_shape, input_data2);
+  Tensor input_tensor1 =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data1, _memory_manager.get());
+  Tensor input_tensor2 =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data2, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Maximum kernel(&input_tensor1, &input_tensor2, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{1.0, 0.0, 1.0, 12.0, -2.0, -1.43};
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
 }
 
-TEST(MaximumTest, Uint8)
+TEST_F(MaximumTest, Uint8)
 {
   Shape input_shape{3, 1, 2};
   std::vector<uint8_t> input_data1{1, 0, 2, 11, 2, 23};
   std::vector<uint8_t> input_data2{0, 0, 1, 12, 255, 1};
-  Tensor input_tensor1 = makeInputTensor<DataType::U8>(input_shape, input_data1);
-  Tensor input_tensor2 = makeInputTensor<DataType::U8>(input_shape, input_data2);
+  Tensor input_tensor1 =
+    makeInputTensor<DataType::U8>(input_shape, input_data1, _memory_manager.get());
+  Tensor input_tensor2 =
+    makeInputTensor<DataType::U8>(input_shape, input_data2, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   Maximum kernel(&input_tensor1, &input_tensor2, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<int32_t> ref_output_shape{2, 4};
diff --git a/compiler/luci-interpreter/src/kernels/Mean.cpp b/compiler/luci-interpreter/src/kernels/Mean.cpp
index 421632812..8e65e0d6d 100644
--- a/compiler/luci-interpreter/src/kernels/Mean.cpp
+++ b/compiler/luci-interpreter/src/kernels/Mean.cpp
@@ -19,7 +19,7 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/reduce.h>
 
 #include <stdexcept>
 
@@ -28,7 +28,7 @@ namespace luci_interpreter
 namespace kernels
 {
 
-static void resolveAxes(const int *axes_data, int num_axes, tflite::MeanParams *params)
+static void resolveAxes(const int32_t *axes_data, int num_axes, tflite::MeanParams *params)
 {
   params->axis_count = num_axes;
   for (int i = 0; i < num_axes; ++i)
@@ -42,7 +42,7 @@ static void resolveAxes(const int *axes_data, int num_axes, tflite::MeanParams *
 }
 
 // Returns the number of axes that will be reduced. Removes duplicates.
-static int getAxisReductionCount(const int *axes_data, int num_axes, int input_num_dims)
+static int getAxisReductionCount(const int32_t *axes_data, int num_axes, int input_num_dims)
 {
   int reduction_count = num_axes;
   for (int i = 0; i < num_axes; ++i)
@@ -63,7 +63,7 @@ static int getAxisReductionCount(const int *axes_data, int num_axes, int input_n
   return reduction_count;
 }
 
-static Shape getOutputShape(const Shape &input_shape, const int *axes_data, int num_axes,
+static Shape getOutputShape(const Shape &input_shape, const int32_t *axes_data, int num_axes,
                             bool keep_dims)
 {
   int input_num_dims = input_shape.num_dims();
@@ -123,8 +123,10 @@ static Shape getOutputShape(const Shape &input_shape, const int *axes_data, int
   }
 }
 
-Mean::Mean(const Tensor *input, const Tensor *axes, Tensor *output, const ReducerParams &params)
-  : KernelWithParams<ReducerParams>({input, axes}, {output}, params)
+Mean::Mean(const Tensor *input, const Tensor *axes, Tensor *output, Tensor *temp_index,
+           Tensor *resolved_axes, Tensor *temp_sum, const ReducerParams &params)
+  : KernelWithParams<ReducerParams>({input, axes}, {output, temp_index, resolved_axes, temp_sum},
+                                    params)
 {
 }
 
@@ -149,17 +151,28 @@ void Mean::configure()
 
   tflite::MeanParams params{};
   resolveAxes(axes_data, num_axes, &params);
-  const bool need_temporaries = !(
+  _need_temporaries = !(
     _params.keep_dims && input_num_dims == 4 && params.axis_count == 2 &&
     ((params.axis[0] == 1 && params.axis[1] == 2) || (params.axis[0] == 2 && params.axis[1] == 1)));
-  if (need_temporaries)
+  if (_need_temporaries)
   {
-    _temp_index =
-      std::make_unique<Tensor>(DataType::S32, Shape(input_num_dims), AffineQuantization{}, "");
-    _resolved_axes =
-      std::make_unique<Tensor>(DataType::S32, Shape(num_axes), AffineQuantization{}, "");
-    _temp_sum = std::make_unique<Tensor>(input()->element_type(), output()->shape(),
-                                         AffineQuantization{}, "");
+    auto temp_index = getOutputTensors()[1];
+    auto resolved_axes = getOutputTensors()[2];
+    auto temp_sum = getOutputTensors()[3];
+
+    temp_index->resize(Shape(input_num_dims));
+    resolved_axes->resize(Shape(num_axes));
+    temp_sum->resize(output()->shape());
+  }
+  else
+  {
+    auto temp_index = getOutputTensors()[1];
+    auto resolved_axes = getOutputTensors()[2];
+    auto temp_sum = getOutputTensors()[3];
+
+    temp_index->set_allocatable(false);
+    resolved_axes->set_allocatable(false);
+    temp_sum->set_allocatable(false);
   }
 }
 
@@ -179,12 +192,6 @@ void Mean::execute() const
     default:
       throw std::runtime_error("Unsupported type.");
   }
-  if (!!_temp_index)
-    _temp_index->deallocate();
-  if (!!_resolved_axes)
-    _resolved_axes->deallocate();
-  if (!!_temp_sum)
-    _temp_sum->deallocate();
 }
 
 void Mean::evalFloat() const
@@ -197,6 +204,10 @@ void Mean::evalFloat() const
   tflite::MeanParams params{};
   resolveAxes(axes_data, num_axes, &params);
 
+  auto temp_index = getOutputTensors()[1];
+  auto resolved_axes = getOutputTensors()[2];
+  auto temp_sum = getOutputTensors()[3];
+
   // Defer to specialized implementation for 4D Mean across axes 1 & 2.
   if (_params.keep_dims && input_num_dims == 4 && params.axis_count == 2 &&
       ((params.axis[0] == 1 && params.axis[1] == 2) ||
@@ -207,12 +218,12 @@ void Mean::evalFloat() const
   }
   else
   {
-    tflite::reference_ops::Mean(
-      getTensorData<float>(input()), getTensorShape(input()).DimsData(),
-      input()->shape().num_dims(), getTensorData<float>(output()),
-      getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes,
-      _params.keep_dims, getTensorData<int>(_temp_index.get()),
-      getTensorData<int>(_resolved_axes.get()), getTensorData<float>(_temp_sum.get()));
+    tflite::reference_ops::Mean(getTensorData<float>(input()), getTensorShape(input()).DimsData(),
+                                input()->shape().num_dims(), getTensorData<float>(output()),
+                                getTensorShape(output()).DimsData(), output()->shape().num_dims(),
+                                axes_data, num_axes, _params.keep_dims,
+                                getTensorData<int>(temp_index), getTensorData<int>(resolved_axes),
+                                getTensorData<float>(temp_sum));
   }
 }
 
@@ -226,6 +237,10 @@ void Mean::evalQuantized() const
   tflite::MeanParams params{};
   resolveAxes(axes_data, num_axes, &params);
 
+  auto temp_index = getOutputTensors()[1];
+  auto resolved_axes = getOutputTensors()[2];
+  auto temp_sum = getOutputTensors()[3];
+
   // Defer to specialized implementation for 4D Mean across axes 1 & 2.
   if (_params.keep_dims && input_num_dims == 4 && params.axis_count == 2 &&
       ((params.axis[0] == 1 && params.axis[1] == 2) ||
@@ -238,12 +253,12 @@ void Mean::evalQuantized() const
   }
   else if (input()->zero_point() == output()->zero_point() && input()->scale() == output()->scale())
   {
-    tflite::reference_ops::Mean(
-      getTensorData<uint8_t>(input()), getTensorShape(input()).DimsData(),
-      input()->shape().num_dims(), getTensorData<uint8_t>(output()),
-      getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes,
-      _params.keep_dims, getTensorData<int>(_temp_index.get()),
-      getTensorData<int>(_resolved_axes.get()), getTensorData<int>(_temp_sum.get()));
+    tflite::reference_ops::Mean(getTensorData<uint8_t>(input()), getTensorShape(input()).DimsData(),
+                                input()->shape().num_dims(), getTensorData<uint8_t>(output()),
+                                getTensorShape(output()).DimsData(), output()->shape().num_dims(),
+                                axes_data, num_axes, _params.keep_dims,
+                                getTensorData<int>(temp_index), getTensorData<int>(resolved_axes),
+                                getTensorData<int>(temp_sum));
   }
   else
   {
@@ -252,8 +267,8 @@ void Mean::evalQuantized() const
       getTensorShape(input()).DimsData(), input()->shape().num_dims(),
       getTensorData<uint8_t>(output()), output()->zero_point(), output()->scale(),
       getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes,
-      _params.keep_dims, getTensorData<int>(_temp_index.get()),
-      getTensorData<int>(_resolved_axes.get()), getTensorData<int>(_temp_sum.get()),
+      _params.keep_dims, getTensorData<int>(temp_index), getTensorData<int>(resolved_axes),
+      getTensorData<int>(temp_sum),
       /*compute_sum=*/false);
   }
 }
diff --git a/compiler/luci-interpreter/src/kernels/Mean.h b/compiler/luci-interpreter/src/kernels/Mean.h
index 1cc046894..ed07ae561 100644
--- a/compiler/luci-interpreter/src/kernels/Mean.h
+++ b/compiler/luci-interpreter/src/kernels/Mean.h
@@ -30,7 +30,8 @@ namespace kernels
 class Mean : public KernelWithParams<ReducerParams>
 {
 public:
-  Mean(const Tensor *input, const Tensor *axes, Tensor *output, const ReducerParams &params);
+  Mean(const Tensor *input, const Tensor *axes, Tensor *output, Tensor *temp_index,
+       Tensor *resolved_axes, Tensor *temp_sum, const ReducerParams &params);
 
   const Tensor *input() const { return _inputs[0]; }
   const Tensor *axes() const { return _inputs[1]; }
@@ -45,9 +46,7 @@ private:
   void evalQuantizedS16() const;
 
 private:
-  std::unique_ptr<Tensor> _temp_index;
-  std::unique_ptr<Tensor> _resolved_axes;
-  std::unique_ptr<Tensor> _temp_sum;
+  bool _need_temporaries = false;
 };
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Mean.test.cpp b/compiler/luci-interpreter/src/kernels/Mean.test.cpp
index fa0ba2169..d2c00935a 100644
--- a/compiler/luci-interpreter/src/kernels/Mean.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Mean.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Mean.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,22 +28,39 @@ namespace
 
 using namespace testing;
 
-TEST(MeanTest, FloatKeepDims)
+class MeanTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(MeanTest, FloatKeepDims)
 {
   std::vector<float> input_data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
                                    9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
                                    17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
 
   std::vector<int32_t> axis_data{0, 2};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({4, 3, 2}, input_data);
-  Tensor axis_tensor = makeInputTensor<DataType::S32>({2}, axis_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({4, 3, 2}, input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({2}, axis_data, _memory_manager.get());
+  Tensor temp_index(DataType::S32, Shape({}), {}, "");
+  Tensor resolved_axes(DataType::S32, Shape({}), {}, "");
+  Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   ReducerParams params{};
   params.keep_dims = true;
 
-  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, params);
+  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum,
+              params);
   kernel.configure();
+  _memory_manager->allocate_memory(temp_index);
+  _memory_manager->allocate_memory(resolved_axes);
+  _memory_manager->allocate_memory(temp_sum);
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{10.5, 12.5, 14.5};
@@ -51,22 +69,31 @@ TEST(MeanTest, FloatKeepDims)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(MeanTest, FloatKeepDims4DMean)
+TEST_F(MeanTest, FloatKeepDims4DMean)
 {
   std::vector<float> input_data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
                                    9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
                                    17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
 
   std::vector<int32_t> axis_data{1, 2};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 3, 2}, input_data);
-  Tensor axis_tensor = makeInputTensor<DataType::S32>({2}, axis_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 2, 3, 2}, input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({2}, axis_data, _memory_manager.get());
+  Tensor temp_index(DataType::S32, Shape({}), {}, "");
+  Tensor resolved_axes(DataType::S32, Shape({}), {}, "");
+  Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   ReducerParams params{};
   params.keep_dims = true;
 
-  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, params);
+  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum,
+              params);
   kernel.configure();
+  _memory_manager->allocate_memory(temp_index);
+  _memory_manager->allocate_memory(resolved_axes);
+  _memory_manager->allocate_memory(temp_sum);
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{6, 7, 18, 19};
@@ -75,22 +102,31 @@ TEST(MeanTest, FloatKeepDims4DMean)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(MeanTest, FloatNotKeepDims)
+TEST_F(MeanTest, FloatNotKeepDims)
 {
   std::vector<float> input_data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
                                    9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
                                    17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
 
   std::vector<int32_t> axis_data{1, 0, -3, -3};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({4, 3, 2}, input_data);
-  Tensor axis_tensor = makeInputTensor<DataType::S32>({4}, axis_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({4, 3, 2}, input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({4}, axis_data, _memory_manager.get());
+  Tensor temp_index(DataType::S32, Shape({}), {}, "");
+  Tensor resolved_axes(DataType::S32, Shape({}), {}, "");
+  Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   ReducerParams params{};
   params.keep_dims = false;
 
-  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, params);
+  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum,
+              params);
   kernel.configure();
+  _memory_manager->allocate_memory(temp_index);
+  _memory_manager->allocate_memory(resolved_axes);
+  _memory_manager->allocate_memory(temp_sum);
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{12, 13};
@@ -99,23 +135,31 @@ TEST(MeanTest, FloatNotKeepDims)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(MeanTest, Uint8KeepDims)
+TEST_F(MeanTest, Uint8KeepDims)
 {
   float kQuantizedTolerance = getTolerance(-1.0, 1.0, 255);
   std::vector<float> input_data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
 
   std::vector<int32_t> axis_data{1};
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({3, 2}, quant_param.first, quant_param.second, input_data);
-  Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>({3, 2}, quant_param.first, quant_param.second,
+                                                      input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_data, _memory_manager.get());
+  Tensor temp_index(DataType::S32, Shape({}), {}, "");
+  Tensor resolved_axes(DataType::S32, Shape({}), {}, "");
+  Tensor temp_sum(DataType::U8, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
 
   ReducerParams params{};
   params.keep_dims = true;
 
-  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, params);
+  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum,
+              params);
   kernel.configure();
+  _memory_manager->allocate_memory(temp_index);
+  _memory_manager->allocate_memory(resolved_axes);
+  _memory_manager->allocate_memory(temp_sum);
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{0.3, 0.35, 0.55};
@@ -125,23 +169,31 @@ TEST(MeanTest, Uint8KeepDims)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(MeanTest, Uint8NotKeepDims)
+TEST_F(MeanTest, Uint8NotKeepDims)
 {
   float kQuantizedTolerance = getTolerance(-1.0, 1.0, 255);
   std::vector<float> input_data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
 
   std::vector<int32_t> axis_data{1};
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({1, 3, 2}, quant_param.first, quant_param.second, input_data);
-  Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 3, 2}, quant_param.first, quant_param.second, input_data, _memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_data, _memory_manager.get());
+  Tensor temp_index(DataType::S32, Shape({}), {}, "");
+  Tensor resolved_axes(DataType::S32, Shape({}), {}, "");
+  Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
 
   ReducerParams params{};
   params.keep_dims = false;
 
-  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, params);
+  Mean kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum,
+              params);
   kernel.configure();
+  _memory_manager->allocate_memory(temp_index);
+  _memory_manager->allocate_memory(resolved_axes);
+  _memory_manager->allocate_memory(temp_sum);
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{0.4, 0.4};
@@ -151,7 +203,7 @@ TEST(MeanTest, Uint8NotKeepDims)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(MeanTest, SInt16KeepDims4D)
+TEST_F(MeanTest, SInt16KeepDims4D)
 {
   std::vector<float> input_data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
                                    9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
@@ -159,15 +211,24 @@ TEST(MeanTest, SInt16KeepDims4D)
   std::vector<int32_t> axes_data{1, 2};
   std::vector<float> ref_output_data{6, 7, 18, 19};
 
-  Tensor input_tensor = makeInputTensor<DataType::S16>({2, 2, 3, 2}, 0.25, 0, input_data);
-  Tensor axes_tensor = makeInputTensor<DataType::S32>({2}, axes_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({2, 2, 3, 2}, 0.25, 0, input_data, _memory_manager.get());
+  Tensor axes_tensor = makeInputTensor<DataType::S32>({2}, axes_data, _memory_manager.get());
+  Tensor temp_index(DataType::S32, Shape({}), {}, "");
+  Tensor resolved_axes(DataType::S32, Shape({}), {}, "");
+  Tensor temp_sum(DataType::FLOAT32, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(DataType::S16, 0.2, 0);
 
   ReducerParams params{};
   params.keep_dims = true;
 
-  Mean kernel(&input_tensor, &axes_tensor, &output_tensor, params);
+  Mean kernel(&input_tensor, &axes_tensor, &output_tensor, &temp_index, &resolved_axes, &temp_sum,
+              params);
   kernel.configure();
+  _memory_manager->allocate_memory(temp_index);
+  _memory_manager->allocate_memory(resolved_axes);
+  _memory_manager->allocate_memory(temp_sum);
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 1, 1, 2}));
diff --git a/compiler/luci-interpreter/src/kernels/Minimum.test.cpp b/compiler/luci-interpreter/src/kernels/Minimum.test.cpp
index b6420dd9b..9a143643f 100644
--- a/compiler/luci-interpreter/src/kernels/Minimum.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Minimum.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Minimum.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,34 +28,48 @@ namespace
 
 using namespace testing;
 
-TEST(MinimumTest, Float)
+class MinimumTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(MinimumTest, Float)
 {
   Shape input_shape{3, 1, 2};
   std::vector<float> input_data1{1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
   std::vector<float> input_data2{-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
-  Tensor input_tensor1 = makeInputTensor<DataType::FLOAT32>(input_shape, input_data1);
-  Tensor input_tensor2 = makeInputTensor<DataType::FLOAT32>(input_shape, input_data2);
+  Tensor input_tensor1 =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data1, _memory_manager.get());
+  Tensor input_tensor2 =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data2, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Minimum kernel(&input_tensor1, &input_tensor2, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{-1.0, 0.0, -1.0, 11.0, -3.0, -1.44};
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
 }
 
-TEST(MinimumTest, Uint8)
+TEST_F(MinimumTest, Uint8)
 {
   Shape input_shape{3, 1, 2};
   std::vector<uint8_t> input_data1{1, 0, 2, 11, 2, 23};
   std::vector<uint8_t> input_data2{0, 0, 1, 12, 255, 1};
-  Tensor input_tensor1 = makeInputTensor<DataType::U8>(input_shape, input_data1);
-  Tensor input_tensor2 = makeInputTensor<DataType::U8>(input_shape, input_data2);
+  Tensor input_tensor1 =
+    makeInputTensor<DataType::U8>(input_shape, input_data1, _memory_manager.get());
+  Tensor input_tensor2 =
+    makeInputTensor<DataType::U8>(input_shape, input_data2, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   Minimum kernel(&input_tensor1, &input_tensor2, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<int32_t> ref_output_shape{2, 4};
diff --git a/compiler/luci-interpreter/src/kernels/MirrorPad.cpp b/compiler/luci-interpreter/src/kernels/MirrorPad.cpp
index 1139167e0..89049c96c 100644
--- a/compiler/luci-interpreter/src/kernels/MirrorPad.cpp
+++ b/compiler/luci-interpreter/src/kernels/MirrorPad.cpp
@@ -18,7 +18,7 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/pad.h>
 
 namespace luci_interpreter
 {
diff --git a/compiler/luci-interpreter/src/kernels/MirrorPad.test.cpp b/compiler/luci-interpreter/src/kernels/MirrorPad.test.cpp
new file mode 100644
index 000000000..de9da5051
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/MirrorPad.test.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// TODO: Add tests for MirrorPad
diff --git a/compiler/luci-interpreter/src/kernels/Mul.cpp b/compiler/luci-interpreter/src/kernels/Mul.cpp
index 4e6e3f75a..bc855de0f 100644
--- a/compiler/luci-interpreter/src/kernels/Mul.cpp
+++ b/compiler/luci-interpreter/src/kernels/Mul.cpp
@@ -20,7 +20,9 @@
 #include "kernels/BinaryOpCommon.h"
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALMul.h"
+
+#include <tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h>
 
 #include <stdexcept>
 
@@ -77,15 +79,15 @@ void Mul::evalFloat() const
 
   if (need_broadcast)
   {
-    tflite::optimized_ops::BroadcastMul4DSlow(
+    luci_interpreter_pal::BroadcastMul4DSlow(
       params, getTensorShape(input1()), getTensorData<float>(input1()), getTensorShape(input2()),
       getTensorData<float>(input2()), getTensorShape(output()), getTensorData<float>(output()));
   }
   else
   {
-    tflite::optimized_ops::Mul(params, getTensorShape(input1()), getTensorData<float>(input1()),
-                               getTensorShape(input2()), getTensorData<float>(input2()),
-                               getTensorShape(output()), getTensorData<float>(output()));
+    luci_interpreter_pal::Mul(params, getTensorShape(input1()), getTensorData<float>(input1()),
+                              getTensorShape(input2()), getTensorData<float>(input2()),
+                              getTensorShape(output()), getTensorData<float>(output()));
   }
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/Mul.test.cpp b/compiler/luci-interpreter/src/kernels/Mul.test.cpp
index fc7ffb5a1..471f6ac86 100644
--- a/compiler/luci-interpreter/src/kernels/Mul.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Mul.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Mul.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,7 +28,15 @@ namespace
 
 using namespace testing;
 
-TEST(MulTest, Float)
+class MulTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(MulTest, Float)
 {
   Shape base_shape = {2, 3, 1, 2};
   std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
@@ -45,8 +54,10 @@ TEST(MulTest, Float)
   std::vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
   for (size_t i = 0; i < test_shapes.size(); ++i)
   {
-    Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(base_shape, input1_data);
-    Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(test_shapes[i], input2_data);
+    Tensor input1_tensor =
+      makeInputTensor<DataType::FLOAT32>(base_shape, input1_data, _memory_manager.get());
+    Tensor input2_tensor =
+      makeInputTensor<DataType::FLOAT32>(test_shapes[i], input2_data, _memory_manager.get());
     Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
     MulParams params{};
@@ -54,6 +65,7 @@ TEST(MulTest, Float)
 
     Mul kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
@@ -62,8 +74,10 @@ TEST(MulTest, Float)
   // Re-run with exchanged inputs.
   for (size_t i = 0; i < test_shapes.size(); ++i)
   {
-    Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(test_shapes[i], input2_data);
-    Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(base_shape, input1_data);
+    Tensor input1_tensor =
+      makeInputTensor<DataType::FLOAT32>(test_shapes[i], input2_data, _memory_manager.get());
+    Tensor input2_tensor =
+      makeInputTensor<DataType::FLOAT32>(base_shape, input1_data, _memory_manager.get());
     Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
     MulParams params{};
@@ -71,6 +85,7 @@ TEST(MulTest, Float)
 
     Mul kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
@@ -78,7 +93,7 @@ TEST(MulTest, Float)
   }
 }
 
-TEST(MulTest, SInt16)
+TEST_F(MulTest, SInt16)
 {
   Shape base_shape = {2, 3, 1, 2};
   std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
@@ -99,9 +114,10 @@ TEST(MulTest, SInt16)
     {0.00f, 0.46f, 0.27f, 0.15f, 0.00f, 0.44f, 0.60f, 1.40f, 0.00f, 0.00f, 0.63f, 0.00f}};
   for (size_t i = 0; i < test_shapes.size(); ++i)
   {
-    Tensor input1_tensor = makeInputTensor<DataType::S16>(base_shape, 3.0 / 32767, 0, input1_data);
-    Tensor input2_tensor =
-      makeInputTensor<DataType::S16>(test_shapes[i], 1.0 / 32767, 0, input2_data);
+    Tensor input1_tensor = makeInputTensor<DataType::S16>(base_shape, 3.0 / 32767, 0, input1_data,
+                                                          _memory_manager.get());
+    Tensor input2_tensor = makeInputTensor<DataType::S16>(test_shapes[i], 1.0 / 32767, 0,
+                                                          input2_data, _memory_manager.get());
     Tensor output_tensor = makeOutputTensor(DataType::S16, 4.0 / 32767, 0);
     const float tolerance = output_tensor.scale() * 2;
 
@@ -110,6 +126,7 @@ TEST(MulTest, SInt16)
 
     Mul kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(extractTensorShape(output_tensor),
@@ -121,9 +138,10 @@ TEST(MulTest, SInt16)
   // Re-run with exchanged inputs and different scales.
   for (size_t i = 0; i < test_shapes.size(); ++i)
   {
-    Tensor input1_tensor =
-      makeInputTensor<DataType::S16>(test_shapes[i], 2.0 / 32767, 0, input2_data);
-    Tensor input2_tensor = makeInputTensor<DataType::S16>(base_shape, 4.0 / 32767, 0, input1_data);
+    Tensor input1_tensor = makeInputTensor<DataType::S16>(test_shapes[i], 2.0 / 32767, 0,
+                                                          input2_data, _memory_manager.get());
+    Tensor input2_tensor = makeInputTensor<DataType::S16>(base_shape, 4.0 / 32767, 0, input1_data,
+                                                          _memory_manager.get());
     Tensor output_tensor = makeOutputTensor(DataType::S16, 3.0 / 32767, 0);
     const float tolerance = output_tensor.scale() * 2;
 
@@ -132,6 +150,7 @@ TEST(MulTest, SInt16)
 
     Mul kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(extractTensorShape(output_tensor),
diff --git a/compiler/luci-interpreter/src/kernels/Neg.cpp b/compiler/luci-interpreter/src/kernels/Neg.cpp
index 99f4d4a21..c6fe08a9e 100644
--- a/compiler/luci-interpreter/src/kernels/Neg.cpp
+++ b/compiler/luci-interpreter/src/kernels/Neg.cpp
@@ -17,7 +17,7 @@
 #include "kernels/Neg.h"
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALNeg.h"
 
 #include <stdexcept>
 
@@ -50,8 +50,8 @@ void Neg::execute() const
 
 void Neg::evalFloat() const
 {
-  tflite::reference_ops::Negate(getTensorShape(input()), getTensorData<float>(input()),
-                                getTensorShape(output()), getTensorData<float>(output()));
+  luci_interpreter_pal::Negate(getTensorShape(input()), getTensorData<float>(input()),
+                               getTensorShape(output()), getTensorData<float>(output()));
 }
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Neg.test.cpp b/compiler/luci-interpreter/src/kernels/Neg.test.cpp
index 33256e1c6..8b2bc1a82 100644
--- a/compiler/luci-interpreter/src/kernels/Neg.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Neg.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Neg.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -31,13 +32,16 @@ template <typename T>
 void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
            std::initializer_list<T> input_data, std::initializer_list<T> output_data)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   constexpr DataType element_type = getElementType<T>();
-  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(element_type);
 
   Neg kernel(&input_tensor, &output_tensor);
 
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
diff --git a/compiler/luci-interpreter/src/kernels/NotEqual.h b/compiler/luci-interpreter/src/kernels/NotEqual.h
index d729c6c14..247874df7 100644
--- a/compiler/luci-interpreter/src/kernels/NotEqual.h
+++ b/compiler/luci-interpreter/src/kernels/NotEqual.h
@@ -42,9 +42,9 @@ private:
 
 private:
   int32_t _x_multiplier = 0;
-  int32_t _x_shift = 0;
+  int _x_shift = 0;
   int32_t _y_multiplier = 0;
-  int32_t _y_shift = 0;
+  int _y_shift = 0;
 };
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp b/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp
index f9dc7781b..763f86893 100644
--- a/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/NotEqual.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,7 +28,15 @@ namespace
 
 using namespace testing;
 
-TEST(NotEqualTest, FloatSimple)
+class NotEqualTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(NotEqualTest, FloatSimple)
 {
   std::vector<float> x_data{
     0.5, 0.7, 0.9, // Row 1
@@ -44,19 +53,20 @@ TEST(NotEqualTest, FloatSimple)
     true, false, true, // Row 2
   };
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
 }
 
-TEST(NotEqualTest, FloatBroardcast)
+TEST_F(NotEqualTest, FloatBroardcast)
 {
   std::vector<float> x_data{
     0.5, 0.7, 0.9, // Row 1
@@ -76,12 +86,13 @@ TEST(NotEqualTest, FloatBroardcast)
     false, false, false, // Row 4
   };
 
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({4, 3}, x_data);
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({4, 3}, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1, 3}, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
@@ -92,7 +103,7 @@ TEST(NotEqualTest, FloatBroardcast)
 const float F_MIN = -128.0 / 128.0;
 const float F_MAX = 127.0 / 128.0;
 
-TEST(NotEqualTest, Uint8Quantized)
+TEST_F(NotEqualTest, Uint8Quantized)
 {
   std::vector<float> x_data{
     0.5, 0.5, 0.7,  0.9, // Row 1
@@ -110,24 +121,25 @@ TEST(NotEqualTest, Uint8Quantized)
   };
 
   std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data, _memory_manager.get());
 
   std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 2, F_MAX * 2);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data);
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data, _memory_manager.get());
 
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(NotEqualTest, Uint8QuantizedBroadcast)
+TEST_F(NotEqualTest, Uint8QuantizedBroadcast)
 {
   std::vector<float> x_data{
     0.4,  -0.8, 0.7,  0.3, // Row 1
@@ -148,34 +160,35 @@ TEST(NotEqualTest, Uint8QuantizedBroadcast)
   };
 
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
-  Tensor x_tensor =
-    makeInputTensor<DataType::U8>({1, 4, 4, 1}, quant_param.first, quant_param.second, x_data);
-  Tensor y_tensor =
-    makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+  Tensor x_tensor = makeInputTensor<DataType::U8>(
+    {1, 4, 4, 1}, quant_param.first, quant_param.second, x_data, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>(
+    {1, 1, 4, 1}, quant_param.first, quant_param.second, y_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 4, 4, 1}));
   EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data));
 }
 
-TEST(NotEqualTest, Input_Type_Mismatch_NEG)
+TEST_F(NotEqualTest, Input_Type_Mismatch_NEG)
 {
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::U8>({1}, {1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(NotEqualTest, Input_Output_Type_NEG)
+TEST_F(NotEqualTest, Input_Output_Type_NEG)
 {
-  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/PRelu.cpp b/compiler/luci-interpreter/src/kernels/PRelu.cpp
index a53ac6f80..5a6b05c3a 100644
--- a/compiler/luci-interpreter/src/kernels/PRelu.cpp
+++ b/compiler/luci-interpreter/src/kernels/PRelu.cpp
@@ -19,7 +19,8 @@
 #include "kernels/BinaryOpCommon.h"
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/binary_function.h>
+#include <tensorflow/lite/kernels/internal/reference/prelu.h>
 
 #include <stdexcept>
 
@@ -168,10 +169,11 @@ static inline int16_t evalElemS16PRelu(int16_t input_val, int16_t alpha_val,
   constexpr int32_t quantized_max = std::numeric_limits<int16_t>::max();
 
   const int32_t output_val =
-    input_val >= 0 ? tflite::MultiplyByQuantizedMultiplier(input_val, identity_mult.multiplier,
-                                                           identity_mult.shift)
-                   : tflite::MultiplyByQuantizedMultiplier(input_val * alpha_val,
-                                                           alpha_mult.multiplier, alpha_mult.shift);
+    input_val >= 0
+      ? tflite::MultiplyByQuantizedMultiplier(static_cast<int32_t>(input_val),
+                                              identity_mult.multiplier, identity_mult.shift)
+      : tflite::MultiplyByQuantizedMultiplier(static_cast<int32_t>(input_val * alpha_val),
+                                              alpha_mult.multiplier, alpha_mult.shift);
   const int32_t clamped_output = std::min(quantized_max, std::max(quantized_min, output_val));
   return clamped_output;
 }
diff --git a/compiler/luci-interpreter/src/kernels/PRelu.h b/compiler/luci-interpreter/src/kernels/PRelu.h
index e85c3f7e9..f7735d418 100644
--- a/compiler/luci-interpreter/src/kernels/PRelu.h
+++ b/compiler/luci-interpreter/src/kernels/PRelu.h
@@ -50,7 +50,7 @@ private:
   std::vector<ChannelQuantMultipliers> _alpha_multipliers;
   // TODO merge this into one ChannelQuantMultiplier object
   int32_t _output_multiplier_identity = 0;
-  int32_t _output_shift_identity = 0;
+  int _output_shift_identity = 0;
 };
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/PRelu.test.cpp b/compiler/luci-interpreter/src/kernels/PRelu.test.cpp
index 3dbc51cc1..6d97382de 100644
--- a/compiler/luci-interpreter/src/kernels/PRelu.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/PRelu.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/PRelu.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -32,14 +33,18 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
            std::initializer_list<int32_t> output_shape, std::initializer_list<T> input_data,
            std::initializer_list<T> alpha_data, std::initializer_list<T> output_data)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   constexpr DataType element_type = getElementType<T>();
-  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
-  Tensor alpha_tensor = makeInputTensor<element_type>(alpha_shape, alpha_data);
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
+  Tensor alpha_tensor =
+    makeInputTensor<element_type>(alpha_shape, alpha_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(element_type);
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
 
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
@@ -97,6 +102,7 @@ float GetTolerance(float min, float max) { return (max - min) / 255.0; }
 
 TEST(PReluTest, Uint8Simple)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input_data{-0.8f, 0.2f, 0.9f, 0.7f, 0.1f, -0.4f};
   std::vector<float> alpha_data{0.5f, 0.5f, 0.5f, 0.25f, 1.0f, 0.25f};
   std::vector<float> ref_output_data{-0.4f, 0.2f, 0.9f, 0.7f, 0.1f, -0.1f};
@@ -104,14 +110,15 @@ TEST(PReluTest, Uint8Simple)
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
 
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 3, 1}, quant_param.first, quant_param.second, input_data);
-  Tensor alpha_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 3, 1}, quant_param.first, quant_param.second, alpha_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 3, 1}, quant_param.first, quant_param.second, input_data, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 3, 1}, quant_param.first, quant_param.second, alpha_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(dequantizeTensorData(output_tensor),
@@ -147,14 +154,16 @@ TEST(PReluTest, Uint8Broadcast)
   const float kMax = 127.f / 128.f;
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(kMin, kMax);
 
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 2, 3}, quant_param.first, quant_param.second, input_data);
-  Tensor alpha_tensor =
-    makeInputTensor<DataType::U8>({1, 1, 3}, quant_param.first, quant_param.second, alpha_data);
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 2, 3}, quant_param.first, quant_param.second, input_data, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::U8>(
+    {1, 1, 3}, quant_param.first, quant_param.second, alpha_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(dequantizeTensorData(output_tensor),
@@ -166,12 +175,15 @@ TEST(PReluTest, Uint8Broadcast)
 
 TEST(PReluTest, SInt16_LWQ_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   // Rewrite this test in case layer-wise quantization for sint16 is supported
   std::vector<float> input_data(6); // data is not important
   std::vector<float> alpha_data(6);
 
-  Tensor input_tensor = makeInputTensor<DataType::S16>({1, 2, 3, 1}, 0.1, 0, input_data);
-  Tensor alpha_tensor = makeInputTensor<DataType::S16>({1, 2, 3, 1}, 0.1, 0, alpha_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 2, 3, 1}, 0.1, 0, input_data, memory_manager.get());
+  Tensor alpha_tensor =
+    makeInputTensor<DataType::S16>({1, 2, 3, 1}, 0.1, 0, alpha_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S16, 0.1, 0);
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
@@ -180,18 +192,22 @@ TEST(PReluTest, SInt16_LWQ_NEG)
 
 TEST(PReluTest, SInt16_CWQ_Simple)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input_data{-0.8f, 0.2f, 0.9f, -0.7f, 0.1f, -0.4f};
   std::vector<float> alpha_data{0.5f, 0.25f};
   std::vector<float> ref_output_data{-0.4f, 0.2f, 0.9f, -0.175f, 0.1f, -0.1f};
 
   std::vector<float> alpha_scales{0.05f, 0.025f};
   std::vector<int32_t> zerop{0, 0};
-  Tensor input_tensor = makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data);
-  Tensor alpha_tensor = makeInputTensor<DataType::S16>({2}, alpha_scales, zerop, 0, alpha_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data, memory_manager.get());
+  Tensor alpha_tensor =
+    makeInputTensor<DataType::S16>({2}, alpha_scales, zerop, 0, alpha_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S16, 0.025, 0);
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 3, 2}));
@@ -200,14 +216,16 @@ TEST(PReluTest, SInt16_CWQ_Simple)
 
 TEST(PReluTest, SInt16_CWQ_spatial_alpha_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input_data(6); // data is not important
   std::vector<float> alpha_data(6);
 
   std::vector<float> alpha_scales{0.25f, 0.05f};
   std::vector<int32_t> zerop{0, 0};
-  Tensor input_tensor = makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data);
-  Tensor alpha_tensor =
-    makeInputTensor<DataType::S16>({1, 1, 3, 2}, alpha_scales, zerop, 3, alpha_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::S16>({1, 1, 3, 2}, alpha_scales, zerop, 3,
+                                                       alpha_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S16, 0.1, 0);
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
@@ -216,14 +234,16 @@ TEST(PReluTest, SInt16_CWQ_spatial_alpha_NEG)
 
 TEST(PReluTest, SInt16_CWQ_wrong_dim_quant_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input_data(6); // data is not important
   std::vector<float> alpha_data(6);
 
   std::vector<float> alpha_scales{0.25f};
   std::vector<int32_t> zerop{0};
-  Tensor input_tensor = makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data);
-  Tensor alpha_tensor =
-    makeInputTensor<DataType::S16>({1, 1, 1, 2}, alpha_scales, zerop, 1, alpha_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::S16>({1, 1, 1, 2}, alpha_scales, zerop, 1,
+                                                       alpha_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S16, 0.1, 0);
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
@@ -232,19 +252,22 @@ TEST(PReluTest, SInt16_CWQ_wrong_dim_quant_NEG)
 
 TEST(PReluTest, SInt16_CWQ_uneven_shape1)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input_data{-0.8f, 0.2f, 0.9f, -0.7f, 0.1f, -0.4f};
   std::vector<float> alpha_data{0.5f, 0.25f};
   std::vector<float> ref_output_data{-0.4f, 0.2f, 0.9f, -0.175f, 0.1f, -0.1f};
 
   std::vector<float> alpha_scales{0.05f, 0.025f};
   std::vector<int32_t> zerop{0, 0};
-  Tensor input_tensor = makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data);
-  Tensor alpha_tensor =
-    makeInputTensor<DataType::S16>({1, 1, 2}, alpha_scales, zerop, 2, alpha_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::S16>({1, 1, 2}, alpha_scales, zerop, 2,
+                                                       alpha_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S16, 0.025, 0);
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 3, 2}));
@@ -253,6 +276,7 @@ TEST(PReluTest, SInt16_CWQ_uneven_shape1)
 
 TEST(PReluTest, SInt16_CWQ_uneven_shape2)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input_data{
     0.0f,   0.0f,   0.0f,   // Row 1, Column 1
     0.5f,   0.5f,   0.5f,   // Row 1, Column 2
@@ -269,13 +293,15 @@ TEST(PReluTest, SInt16_CWQ_uneven_shape2)
 
   std::vector<float> alpha_scales{1.f, 0.05f, 0.1f};
   std::vector<int32_t> zerop{0, 0, 0};
-  Tensor input_tensor = makeInputTensor<DataType::S16>({1, 2, 2, 3}, 0.01, 0, input_data);
-  Tensor alpha_tensor =
-    makeInputTensor<DataType::S16>({1, 1, 1, 3}, alpha_scales, zerop, 3, alpha_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 2, 2, 3}, 0.01, 0, input_data, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::S16>({1, 1, 1, 3}, alpha_scales, zerop, 3,
+                                                       alpha_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S16, 0.001, 0);
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2, 3}));
@@ -284,8 +310,9 @@ TEST(PReluTest, SInt16_CWQ_uneven_shape2)
 
 TEST(PReluTest, Input_Output_Type_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor alpha_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
@@ -294,8 +321,9 @@ TEST(PReluTest, Input_Output_Type_NEG)
 
 TEST(PReluTest, Input_Alpha_Type_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor alpha_tensor = makeInputTensor<DataType::U8>({1}, {1});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::U8>({1}, {1}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
@@ -304,23 +332,29 @@ TEST(PReluTest, Input_Alpha_Type_NEG)
 
 TEST(PReluTest, Invalid_Input_Type_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1});
-  Tensor alpha_tensor = makeInputTensor<DataType::S64>({1}, {1});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1}, memory_manager.get());
+  Tensor alpha_tensor = makeInputTensor<DataType::S64>({1}, {1}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S64);
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   EXPECT_ANY_THROW(kernel.execute());
 }
 
 TEST(PReluTest, Input_Output_U8_CWQ_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> scales{1.f, 1.f};
   std::vector<int32_t> zerop{0, 0};
   std::vector<float> dummy_data(4, 0.f);
-  Tensor input_tensor = makeInputTensor<DataType::U8>({2, 2}, scales, zerop, 0, dummy_data);
-  Tensor alpha_tensor = makeInputTensor<DataType::U8>({2, 2}, scales, zerop, 0, dummy_data);
-  Tensor output_tensor = makeInputTensor<DataType::U8>({2, 2}, scales, zerop, 0, dummy_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>({2, 2}, scales, zerop, 0, dummy_data, memory_manager.get());
+  Tensor alpha_tensor =
+    makeInputTensor<DataType::U8>({2, 2}, scales, zerop, 0, dummy_data, memory_manager.get());
+  Tensor output_tensor =
+    makeInputTensor<DataType::U8>({2, 2}, scales, zerop, 0, dummy_data, memory_manager.get());
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
@@ -328,12 +362,16 @@ TEST(PReluTest, Input_Output_U8_CWQ_NEG)
 
 TEST(PReluTest, Input_Output_S16_CWQ_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> scales{1.f, 1.f};
   std::vector<int32_t> zerop{0, 0};
   std::vector<float> dummy_data(4, 0.f);
-  Tensor input_tensor = makeInputTensor<DataType::S16>({2, 2}, scales, zerop, 0, dummy_data);
-  Tensor alpha_tensor = makeInputTensor<DataType::S16>({2, 2}, scales, zerop, 0, dummy_data);
-  Tensor output_tensor = makeInputTensor<DataType::S16>({2, 2}, scales, zerop, 0, dummy_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({2, 2}, scales, zerop, 0, dummy_data, memory_manager.get());
+  Tensor alpha_tensor =
+    makeInputTensor<DataType::S16>({2, 2}, scales, zerop, 0, dummy_data, memory_manager.get());
+  Tensor output_tensor =
+    makeInputTensor<DataType::S16>({2, 2}, scales, zerop, 0, dummy_data, memory_manager.get());
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
@@ -341,10 +379,14 @@ TEST(PReluTest, Input_Output_S16_CWQ_NEG)
 
 TEST(PReluTest, Mixing_U8_S16_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> dummy_data(4, 0.f);
-  Tensor input_tensor = makeInputTensor<DataType::U8>({2, 2}, 1.f, 0, dummy_data);
-  Tensor alpha_tensor = makeInputTensor<DataType::S16>({2, 2}, 1.f, 0, dummy_data);
-  Tensor output_tensor = makeInputTensor<DataType::U8>({2, 2}, 1.f, 0, dummy_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>({2, 2}, 1.f, 0, dummy_data, memory_manager.get());
+  Tensor alpha_tensor =
+    makeInputTensor<DataType::S16>({2, 2}, 1.f, 0, dummy_data, memory_manager.get());
+  Tensor output_tensor =
+    makeInputTensor<DataType::U8>({2, 2}, 1.f, 0, dummy_data, memory_manager.get());
 
   PRelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
diff --git a/compiler/luci-interpreter/src/kernels/Pack.test.cpp b/compiler/luci-interpreter/src/kernels/Pack.test.cpp
index 092bd449a..90a0f894e 100644
--- a/compiler/luci-interpreter/src/kernels/Pack.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Pack.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Pack.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -31,6 +32,7 @@ void Check(std::vector<std::initializer_list<int32_t>> input_shapes,
            std::initializer_list<int32_t> output_shape, std::vector<std::vector<T>> input_datas,
            std::initializer_list<T> output_data, int32_t axis)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   constexpr DataType element_type = getElementType<T>();
   std::vector<const Tensor *> inputs(input_datas.size());
   std::vector<Tensor> tmp_inputs;
@@ -39,11 +41,13 @@ void Check(std::vector<std::initializer_list<int32_t>> input_shapes,
     if (std::is_same<T, float>::value)
     {
       tmp_inputs.push_back(Tensor(element_type, input_shapes[i], {}, ""));
+      memory_manager->allocate_memory(tmp_inputs[i]);
       tmp_inputs[i].writeData(input_datas[i].data(), input_datas[i].size() * sizeof(T));
     }
     else
     {
       tmp_inputs.push_back(Tensor(element_type, input_shapes[i], {{1.0f / 255}, {128}}, ""));
+      memory_manager->allocate_memory(tmp_inputs[i]);
       tmp_inputs[i].writeData(input_datas[i].data(), input_datas[i].size() * sizeof(T));
     }
   }
@@ -64,6 +68,7 @@ void Check(std::vector<std::initializer_list<int32_t>> input_shapes,
   Pack kernel(inputs, &output_tensor, params);
 
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
@@ -103,12 +108,13 @@ TYPED_TEST(PackTest, NegAxis)
 
 TEST(Pack, MismatchingInputValuesCount_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input1_data{1, 4};
   std::vector<float> input2_data{2, 5};
   std::vector<float> input3_data{3, 6};
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2}, input1_data);
-  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({2}, input2_data);
-  Tensor input3_tensor = makeInputTensor<DataType::FLOAT32>({2}, input3_data);
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2}, input1_data, memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({2}, input2_data, memory_manager.get());
+  Tensor input3_tensor = makeInputTensor<DataType::FLOAT32>({2}, input3_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
   PackParams params{};
   {
@@ -122,12 +128,13 @@ TEST(Pack, MismatchingInputValuesCount_NEG)
 
 TEST(Pack, InvalidInputAxis_NEG)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input1_data{1, 4};
   std::vector<float> input2_data{2, 5};
   std::vector<float> input3_data{3, 6};
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2}, input1_data);
-  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({2}, input2_data);
-  Tensor input3_tensor = makeInputTensor<DataType::FLOAT32>({2}, input3_data);
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2}, input1_data, memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({2}, input2_data, memory_manager.get());
+  Tensor input3_tensor = makeInputTensor<DataType::FLOAT32>({2}, input3_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
   PackParams params{};
   {
diff --git a/compiler/luci-interpreter/src/kernels/Pad.cpp b/compiler/luci-interpreter/src/kernels/Pad.cpp
index 3e76080a9..700448e7a 100644
--- a/compiler/luci-interpreter/src/kernels/Pad.cpp
+++ b/compiler/luci-interpreter/src/kernels/Pad.cpp
@@ -18,7 +18,7 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/pad.h>
 
 namespace luci_interpreter
 {
diff --git a/compiler/luci-interpreter/src/kernels/Pad.test.cpp b/compiler/luci-interpreter/src/kernels/Pad.test.cpp
index 75b2e560e..7994263e2 100644
--- a/compiler/luci-interpreter/src/kernels/Pad.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Pad.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Pad.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -30,17 +31,20 @@ float GetTolerance(float min, float max) { return (max - min) / 255.0; }
 
 TEST(Pad, Uint8)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
   std::vector<float> input_data{-0.8, 0.2, 0.9, 0.7, 0.1, -0.3};
   std::vector<int32_t> paddings_data{0, 0, 0, 2, 1, 3, 0, 0};
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 3, 1}, quant_param.first, quant_param.second, input_data);
-  Tensor paddings_tensor = makeInputTensor<DataType::S32>({4, 2}, paddings_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 3, 1}, quant_param.first, quant_param.second, input_data, memory_manager.get());
+  Tensor paddings_tensor =
+    makeInputTensor<DataType::S32>({4, 2}, paddings_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
 
   Pad kernel(&input_tensor, &paddings_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
@@ -52,14 +56,18 @@ TEST(Pad, Uint8)
 
 TEST(Pad, Float)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input_data{1, 2, 3, 4, 5, 6};
   std::vector<int32_t> paddings_data{1, 0, 0, 2, 0, 3, 0, 0};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 2, 3, 1}, input_data);
-  Tensor paddings_tensor = makeInputTensor<DataType::S32>({4, 2}, paddings_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 3, 1}, input_data, memory_manager.get());
+  Tensor paddings_tensor =
+    makeInputTensor<DataType::S32>({4, 2}, paddings_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pad kernel(&input_tensor, &paddings_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
diff --git a/compiler/luci-interpreter/src/kernels/PadV2.cpp b/compiler/luci-interpreter/src/kernels/PadV2.cpp
index 3c215dbca..e90469239 100644
--- a/compiler/luci-interpreter/src/kernels/PadV2.cpp
+++ b/compiler/luci-interpreter/src/kernels/PadV2.cpp
@@ -18,7 +18,7 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/pad.h>
 
 namespace luci_interpreter
 {
diff --git a/compiler/luci-interpreter/src/kernels/PadV2.test.cpp b/compiler/luci-interpreter/src/kernels/PadV2.test.cpp
index 1ee741401..41efaff06 100644
--- a/compiler/luci-interpreter/src/kernels/PadV2.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/PadV2.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/PadV2.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -30,20 +31,23 @@ float GetTolerance(float min, float max) { return (max - min) / 255.0; }
 
 TEST(PadV2, Uint8)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
   std::vector<float> input_data{-0.8, 0.2, 0.9, 0.7, 0.1, -0.3};
   std::vector<int32_t> paddings_data{0, 0, 0, 2, 1, 3, 0, 0};
   std::vector<float> constant_values_data{0.5};
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 3, 1}, quant_param.first, quant_param.second, input_data);
-  Tensor paddings_tensor = makeInputTensor<DataType::S32>({4, 2}, paddings_data);
-  Tensor constant_values =
-    makeInputTensor<DataType::U8>({1}, quant_param.first, quant_param.second, constant_values_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 3, 1}, quant_param.first, quant_param.second, input_data, memory_manager.get());
+  Tensor paddings_tensor =
+    makeInputTensor<DataType::S32>({4, 2}, paddings_data, memory_manager.get());
+  Tensor constant_values = makeInputTensor<DataType::U8>(
+    {1}, quant_param.first, quant_param.second, constant_values_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
 
   PadV2 kernel(&input_tensor, &paddings_tensor, &constant_values, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data = {
@@ -56,16 +60,21 @@ TEST(PadV2, Uint8)
 
 TEST(PadV2, Float)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   std::vector<float> input_data{1, 2, 3, 4, 5, 6};
   std::vector<int32_t> paddings_data{1, 0, 0, 2, 0, 3, 0, 0};
   std::vector<float> constant_values_data{7};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 2, 3, 1}, input_data);
-  Tensor paddings_tensor = makeInputTensor<DataType::S32>({4, 2}, paddings_data);
-  Tensor constant_values = makeInputTensor<DataType::FLOAT32>({1}, constant_values_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({1, 2, 3, 1}, input_data, memory_manager.get());
+  Tensor paddings_tensor =
+    makeInputTensor<DataType::S32>({4, 2}, paddings_data, memory_manager.get());
+  Tensor constant_values =
+    makeInputTensor<DataType::FLOAT32>({1}, constant_values_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   PadV2 kernel(&input_tensor, &paddings_tensor, &constant_values, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
diff --git a/compiler/luci-interpreter/src/kernels/Pow.test.cpp b/compiler/luci-interpreter/src/kernels/Pow.test.cpp
index a414440c9..0e858115d 100644
--- a/compiler/luci-interpreter/src/kernels/Pow.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Pow.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Pow.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -26,7 +27,15 @@ namespace
 
 using namespace testing;
 
-TEST(PowTest, SimplePow)
+class PowTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(PowTest, SimplePow)
 {
   std::initializer_list<int32_t> base_shape = {1, 1, 3, 2};
 
@@ -34,19 +43,22 @@ TEST(PowTest, SimplePow)
   std::vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
   std::vector<float> test_outputs{0.786f, 1.2838f, 1.043f, 0.7071f, 0.8f, 1.08956f};
 
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(base_shape, input1_data);
-  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(base_shape, input2_data);
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>(base_shape, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>(base_shape, input2_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs, 0.0001f));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(base_shape));
 }
 
-TEST(PowTest, FloatBroadcastPow)
+TEST_F(PowTest, FloatBroadcastPow)
 {
   std::initializer_list<int32_t> input1_shape = {1, 3};
   std::initializer_list<int32_t> input2_shape = {3, 1};
@@ -56,60 +68,66 @@ TEST(PowTest, FloatBroadcastPow)
   std::vector<float> test_outputs{0.786f,   1.18126f, 0.9791f, 0.6968f, 1.28386f,
                                   0.96888f, 0.6178f,  1.3953f, 0.9587f};
 
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(input1_shape, input1_data);
-  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(input2_shape, input2_data);
+  Tensor input1_tensor =
+    makeInputTensor<DataType::FLOAT32>(input1_shape, input1_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::FLOAT32>(input2_shape, input2_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs, 0.0001f));
 }
 
-TEST(PowTest, IntPow)
+TEST_F(PowTest, IntPow)
 {
   std::initializer_list<int32_t> base_shape = {1, 3};
 
   std::vector<int32_t> input_data{2, 3, 4};
   std::vector<int32_t> test_outputs{4, 27, 256};
 
-  Tensor input1_tensor = makeInputTensor<DataType::S32>(base_shape, input_data);
-  Tensor input2_tensor = makeInputTensor<DataType::S32>(base_shape, input_data);
+  Tensor input1_tensor =
+    makeInputTensor<DataType::S32>(base_shape, input_data, _memory_manager.get());
+  Tensor input2_tensor =
+    makeInputTensor<DataType::S32>(base_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S32);
 
   Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<int32_t>(output_tensor), ::testing::ElementsAreArray(test_outputs));
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(base_shape));
 }
 
-TEST(PowTest, Input_Output_Type_NEG)
+TEST_F(PowTest, Input_Output_Type_NEG)
 {
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.0f});
-  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.0f});
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.0f}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.0f}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::BOOL);
 
   Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(PowTest, Input_Type_Mismatch_NEG)
+TEST_F(PowTest, Input_Type_Mismatch_NEG)
 {
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.0f});
-  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {4});
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.0f}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {4}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(PowTest, Invalid_Input_Type_NEG)
+TEST_F(PowTest, Invalid_Input_Type_NEG)
 {
-  Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1});
-  Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {1});
+  Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S64);
 
   Pow kernel(&input1_tensor, &input2_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/Relu.cpp b/compiler/luci-interpreter/src/kernels/Relu.cpp
index b5acf1d60..747ec6cc8 100644
--- a/compiler/luci-interpreter/src/kernels/Relu.cpp
+++ b/compiler/luci-interpreter/src/kernels/Relu.cpp
@@ -17,7 +17,7 @@
 #include "kernels/Relu.h"
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALRelu.h"
 
 #include <stdexcept>
 
@@ -70,7 +70,7 @@ void Relu::evalFloat() const
   auto output_data = getTensorData<float>(output());
   auto output_shape = getTensorShape(output());
 
-  tflite::optimized_ops::Relu(input_shape, input_data, output_shape, output_data);
+  luci_interpreter_pal::Relu(input_shape, input_data, output_shape, output_data);
 }
 
 void Relu::evalQuantized() const
@@ -85,8 +85,8 @@ void Relu::evalQuantized() const
     std::max(static_cast<int32_t>(std::numeric_limits<uint8_t>::min()), params.output_offset);
   params.quantized_activation_max = static_cast<int32_t>(std::numeric_limits<uint8_t>::max());
 
-  tflite::optimized_ops::ReluX(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
-                               getTensorShape(output()), getTensorData<uint8_t>(output()));
+  luci_interpreter_pal::ReluX(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+                              getTensorShape(output()), getTensorData<uint8_t>(output()));
 }
 
 void Relu::evalQuantizedS16() const
diff --git a/compiler/luci-interpreter/src/kernels/Relu.test.cpp b/compiler/luci-interpreter/src/kernels/Relu.test.cpp
index 6623a5b77..bd32e3cc9 100644
--- a/compiler/luci-interpreter/src/kernels/Relu.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Relu.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Relu.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,7 +28,15 @@ namespace
 
 using namespace testing;
 
-TEST(ReluTest, FloatSimple)
+class ReluTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(ReluTest, FloatSimple)
 {
   std::vector<float> input_data{
     0.0f, 1.0f,  3.0f,  // Row 1
@@ -39,11 +48,13 @@ TEST(ReluTest, FloatSimple)
     1.0f, 0.0f, 0.0f, // Row 2
   };
 
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Relu kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor),
@@ -51,7 +62,7 @@ TEST(ReluTest, FloatSimple)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
 }
 
-TEST(ReluTest, Uint8Quantized)
+TEST_F(ReluTest, Uint8Quantized)
 {
   std::vector<float> input_data{
     0, -6, 2, 4, //
@@ -62,12 +73,13 @@ TEST(ReluTest, Uint8Quantized)
   const float f_max = (127.0 / 128.0) * 8;
 
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(f_min, f_max);
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, input_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
 
   Relu kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
@@ -76,7 +88,7 @@ TEST(ReluTest, Uint8Quantized)
   EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear({0, 0, 2, 4, 3, 0, 7, 1}));
 }
 
-TEST(ReluTest, Uint8Requantized)
+TEST_F(ReluTest, Uint8Requantized)
 {
   std::vector<float> input_data{
     0, -6, 2, 4, //
@@ -90,14 +102,15 @@ TEST(ReluTest, Uint8Requantized)
   const float out_max = (255.0 / 256.0) * 8;
 
   std::pair<float, int32_t> quant_input = quantizationParams<uint8_t>(in_min, in_max);
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_input.first, quant_input.second, input_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_input.first, quant_input.second, input_data, _memory_manager.get());
 
   std::pair<float, int32_t> quant_output = quantizationParams<uint8_t>(out_min, out_max);
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_output.first, quant_output.second);
 
   Relu kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
@@ -106,7 +119,7 @@ TEST(ReluTest, Uint8Requantized)
   EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear({0, 0, 2, 4, 3, 0, 7, 1}));
 }
 
-TEST(ReluTest, SInt16)
+TEST_F(ReluTest, SInt16)
 {
   std::vector<float> input_data{
     0, -6, 2, 4, //
@@ -117,33 +130,36 @@ TEST(ReluTest, SInt16)
     3, 0, 7, 1, //
   };
 
-  Tensor input_tensor = makeInputTensor<DataType::S16>({1, 2, 4, 1}, 0.5, 0, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 2, 4, 1}, 0.5, 0, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S16, 0.25, 0);
 
   Relu kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
   EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
 }
 
-TEST(ReluTest, Input_Output_Type_NEG)
+TEST_F(ReluTest, Input_Output_Type_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   Relu kernel(&input_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(ReluTest, Invalid_Input_Type_NEG)
+TEST_F(ReluTest, Invalid_Input_Type_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1});
+  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S64);
 
   Relu kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   EXPECT_ANY_THROW(kernel.execute());
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/Relu6.cpp b/compiler/luci-interpreter/src/kernels/Relu6.cpp
index fa7aa504a..07205ed3a 100644
--- a/compiler/luci-interpreter/src/kernels/Relu6.cpp
+++ b/compiler/luci-interpreter/src/kernels/Relu6.cpp
@@ -17,7 +17,7 @@
 #include "kernels/Relu6.h"
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALRelu6.h"
 
 #include <stdexcept>
 
@@ -63,7 +63,7 @@ void Relu6::evalFloat() const
   auto output_data = getTensorData<float>(output());
   auto output_shape = getTensorShape(output());
 
-  tflite::optimized_ops::Relu6(input_shape, input_data, output_shape, output_data);
+  luci_interpreter_pal::Relu6(input_shape, input_data, output_shape, output_data);
 }
 
 void Relu6::evalQuantized() const
@@ -80,8 +80,8 @@ void Relu6::evalQuantized() const
     std::min(static_cast<int32_t>(std::numeric_limits<uint8_t>::max()),
              params.output_offset + static_cast<int32>(roundf(6.f / output()->scale())));
 
-  tflite::optimized_ops::ReluX(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
-                               getTensorShape(output()), getTensorData<uint8_t>(output()));
+  luci_interpreter_pal::ReluX(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+                              getTensorShape(output()), getTensorData<uint8_t>(output()));
 }
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Relu6.test.cpp b/compiler/luci-interpreter/src/kernels/Relu6.test.cpp
index fe991389a..af7b3f3db 100644
--- a/compiler/luci-interpreter/src/kernels/Relu6.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Relu6.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Relu6.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,7 +28,15 @@ namespace
 
 using namespace testing;
 
-TEST(Relu6Test, FloatSimple)
+class Relu6Test : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(Relu6Test, FloatSimple)
 {
   std::vector<float> input_data{
     0.0f, 1.0f,  3.0f,  // Row 1
@@ -39,11 +48,13 @@ TEST(Relu6Test, FloatSimple)
     6.0f, 0.0f, 0.0f, // Row 2
   };
 
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 3}, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Relu6 kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor),
@@ -51,7 +62,7 @@ TEST(Relu6Test, FloatSimple)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 3}));
 }
 
-TEST(Relu6Test, Uint8Quantized)
+TEST_F(Relu6Test, Uint8Quantized)
 {
   // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
   const float f_min = (-128.0 / 128.0) * 10;
@@ -64,12 +75,13 @@ TEST(Relu6Test, Uint8Quantized)
   };
 
   std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(f_min, f_max);
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, input_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_param.first, quant_param.second, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
 
   Relu6 kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
@@ -79,7 +91,7 @@ TEST(Relu6Test, Uint8Quantized)
               FloatArrayNear({0, 0, 2, 6, 0, 3, 6, 1}, tolerance));
 }
 
-TEST(Relu6Test, Uint8Requantized)
+TEST_F(Relu6Test, Uint8Requantized)
 {
   // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
   const float in_min = (-128.0 / 128.0) * 10;
@@ -94,14 +106,15 @@ TEST(Relu6Test, Uint8Requantized)
   };
 
   std::pair<float, int32_t> quant_input = quantizationParams<uint8_t>(in_min, in_max);
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_input.first, quant_input.second, input_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 4, 1}, quant_input.first, quant_input.second, input_data, _memory_manager.get());
 
   std::pair<float, int32_t> quant_output = quantizationParams<uint8_t>(out_min, out_max);
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_output.first, quant_output.second);
 
   Relu6 kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4, 1}));
@@ -111,22 +124,23 @@ TEST(Relu6Test, Uint8Requantized)
               FloatArrayNear({0, 0, 2, 6, 0, 3, 6, 1}, tolerance));
 }
 
-TEST(Relu6Test, Input_Output_Type_NEG)
+TEST_F(Relu6Test, Input_Output_Type_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   Relu6 kernel(&input_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(Relu6Test, Invalid_Input_Type_NEG)
+TEST_F(Relu6Test, Invalid_Input_Type_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1});
+  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S64);
 
   Relu6 kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   EXPECT_ANY_THROW(kernel.execute());
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/Reshape.test.cpp b/compiler/luci-interpreter/src/kernels/Reshape.test.cpp
index 38159380f..c2ff3ea1b 100644
--- a/compiler/luci-interpreter/src/kernels/Reshape.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Reshape.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Reshape.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -26,37 +27,51 @@ namespace
 
 using namespace testing;
 
+class ReshapeTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
 // TODO Test types other than FLOAT32.
 
-TEST(ReshapeTest, Regular)
+TEST_F(ReshapeTest, Regular)
 {
   Shape input_shape{1, 2, 2, 3};
   std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
   Shape shape_shape{2};
   std::vector<int32_t> shape_data{3, 4};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor shape_tensor = makeInputTensor<DataType::S32>(shape_shape, shape_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor shape_tensor =
+    makeInputTensor<DataType::S32>(shape_shape, shape_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Reshape kernel(&input_tensor, &shape_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(input_data));
 }
 
-TEST(ReshapeTest, UnknownDimension)
+TEST_F(ReshapeTest, UnknownDimension)
 {
   Shape input_shape{2, 1, 2, 3};
   std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
   Shape shape_shape{3};
   std::vector<int32_t> shape_data{2, -1, 2};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor shape_tensor = makeInputTensor<DataType::S32>(shape_shape, shape_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor shape_tensor =
+    makeInputTensor<DataType::S32>(shape_shape, shape_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Reshape kernel(&input_tensor, &shape_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(input_data));
diff --git a/compiler/luci-interpreter/src/kernels/ResizeBilinear.cpp b/compiler/luci-interpreter/src/kernels/ResizeBilinear.cpp
index 0e9bcc920..e2ddd6a7b 100644
--- a/compiler/luci-interpreter/src/kernels/ResizeBilinear.cpp
+++ b/compiler/luci-interpreter/src/kernels/ResizeBilinear.cpp
@@ -19,7 +19,7 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALResizeBilinear.h"
 
 namespace luci_interpreter
 {
@@ -56,12 +56,12 @@ void ResizeBilinear::execute() const
   switch (output()->element_type())
   {
     case DataType::FLOAT32:
-      tflite::optimized_ops::ResizeBilinear(
+      luci_interpreter_pal::ResizeBilinear(
         op_params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(size()),
         getTensorData<int32_t>(size()), getTensorShape(output()), getTensorData<float>(output()));
       break;
     case DataType::U8:
-      tflite::optimized_ops::ResizeBilinear(
+      luci_interpreter_pal::ResizeBilinear(
         op_params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(size()),
         getTensorData<int32_t>(size()), getTensorShape(output()), getTensorData<uint8_t>(output()));
       break;
diff --git a/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp b/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp
index 68ef6e6c1..7af20f8c4 100644
--- a/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/ResizeBilinear.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -33,8 +34,10 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
            std::initializer_list<int32_t> size_data, std::initializer_list<float> output_data,
            bool align_corners, bool half_pixel_centers)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data);
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   ResizeBilinearParams params{};
@@ -43,6 +46,7 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
 
   ResizeBilinear kernel(&input_tensor, &size_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
@@ -60,8 +64,11 @@ void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
 {
   // On TFlite example use Uint8 value it self, so this means quant param scale 1.0f and zero
   // point 0.
-  Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, 1.0, 0, input_data);
-  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data);
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>(input_shape, 1.0, 0, input_data, memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, 1.0, 0);
 
   ResizeBilinearParams params{};
@@ -70,6 +77,7 @@ void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
 
   ResizeBilinear kernel(&input_tensor, &size_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
@@ -152,13 +160,17 @@ TEST(ResizeBilinearTest, HalfPixelCenterUint8Test)
 
 TEST(ResizeBilinearTest, InputShapeInvalid_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2}, {
-                                                                        3, 6,  //
-                                                                        9, 12, //
-                                                                        4, 10, //
-                                                                        10, 16 //
-                                                                      });
-  Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2},
+                                                           {
+                                                             3, 6,  //
+                                                             9, 12, //
+                                                             4, 10, //
+                                                             10, 16 //
+                                                           },
+                                                           memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   ResizeBilinearParams params{};
@@ -171,13 +183,17 @@ TEST(ResizeBilinearTest, InputShapeInvalid_NEG)
 
 TEST(ResizeBilinearTest, SizeShapeInvalid_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
-                                                                           3, 6,  //
-                                                                           9, 12, //
-                                                                           4, 10, //
-                                                                           10, 16 //
-                                                                         });
-  Tensor size_tensor = makeInputTensor<DataType::S32>({2, 1}, {3, 3});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1},
+                                                           {
+                                                             3, 6,  //
+                                                             9, 12, //
+                                                             4, 10, //
+                                                             10, 16 //
+                                                           },
+                                                           memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>({2, 1}, {3, 3}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   ResizeBilinearParams params{};
@@ -190,13 +206,17 @@ TEST(ResizeBilinearTest, SizeShapeInvalid_NEG)
 
 TEST(ResizeBilinearTest, SizeDimInvalid_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
-                                                                           3, 6,  //
-                                                                           9, 12, //
-                                                                           4, 10, //
-                                                                           10, 16 //
-                                                                         });
-  Tensor size_tensor = makeInputTensor<DataType::S32>({3}, {3, 3, 1});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1},
+                                                           {
+                                                             3, 6,  //
+                                                             9, 12, //
+                                                             4, 10, //
+                                                             10, 16 //
+                                                           },
+                                                           memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>({3}, {3, 3, 1}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   ResizeBilinearParams params{};
@@ -209,13 +229,17 @@ TEST(ResizeBilinearTest, SizeDimInvalid_NEG)
 
 TEST(ResizeBilinearTest, InvalidParams_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
-                                                                           3, 6,  //
-                                                                           9, 12, //
-                                                                           4, 10, //
-                                                                           10, 16 //
-                                                                         });
-  Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1},
+                                                           {
+                                                             3, 6,  //
+                                                             9, 12, //
+                                                             4, 10, //
+                                                             10, 16 //
+                                                           },
+                                                           memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   ResizeBilinearParams params{};
diff --git a/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp
index c52264997..306cefbc2 100644
--- a/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp
+++ b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp
@@ -19,8 +19,8 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h>
+#include "PALResizeNearestNeighbor.h"
 
 namespace luci_interpreter
 {
@@ -61,7 +61,7 @@ void ResizeNearestNeighbor::execute() const
         getTensorData<int32_t>(size()), getTensorShape(output()), getTensorData<int32_t>(output()));
       break;
     case DataType::U8:
-      tflite::optimized_ops::ResizeNearestNeighbor(
+      luci_interpreter_pal::ResizeNearestNeighbor(
         op_params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(size()),
         getTensorData<int32_t>(size()), getTensorShape(output()), getTensorData<uint8_t>(output()));
       break;
diff --git a/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp
index 0b36a29af..0e9017c78 100644
--- a/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/ResizeNearestNeighbor.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -33,8 +34,11 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
            std::initializer_list<int32_t> size_data, std::initializer_list<float> output_data,
            bool align_corners, bool half_pixel_centers)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data);
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   ResizeNearestNeighborParams params{};
@@ -43,6 +47,7 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
 
   ResizeNearestNeighbor kernel(&input_tensor, &size_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
@@ -58,12 +63,14 @@ void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
                     std::initializer_list<float> output_data, bool align_corners,
                     bool half_pixel_centers)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   std::pair<float, int32_t> quant_param =
     quantizationParams<uint8_t>(std::min(input_data) < 0 ? std::min(input_data) : 0.f,
                                 std::max(input_data) > 0 ? std::max(input_data) : 0.f);
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>(input_shape, quant_param.first, quant_param.second, input_data);
-  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    input_shape, quant_param.first, quant_param.second, input_data, memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.first);
 
   ResizeNearestNeighborParams params{};
@@ -72,6 +79,7 @@ void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
 
   ResizeNearestNeighbor kernel(&input_tensor, &size_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
@@ -151,13 +159,17 @@ TYPED_TEST(ResizeNearestNeighborTest, HalfPixelCenterTest)
 
 TEST(ResizeNearestNeighborTest, InputShapeInvalid_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2}, {
-                                                                        3, 6,  //
-                                                                        9, 12, //
-                                                                        4, 10, //
-                                                                        10, 16 //
-                                                                      });
-  Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2},
+                                                           {
+                                                             3, 6,  //
+                                                             9, 12, //
+                                                             4, 10, //
+                                                             10, 16 //
+                                                           },
+                                                           memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   ResizeNearestNeighborParams params{};
@@ -170,13 +182,17 @@ TEST(ResizeNearestNeighborTest, InputShapeInvalid_NEG)
 
 TEST(ResizeNearestNeighborTest, SizeShapeInvalid_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
-                                                                           3, 6,  //
-                                                                           9, 12, //
-                                                                           4, 10, //
-                                                                           10, 16 //
-                                                                         });
-  Tensor size_tensor = makeInputTensor<DataType::S32>({2, 1}, {3, 3});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1},
+                                                           {
+                                                             3, 6,  //
+                                                             9, 12, //
+                                                             4, 10, //
+                                                             10, 16 //
+                                                           },
+                                                           memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>({2, 1}, {3, 3}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   ResizeNearestNeighborParams params{};
@@ -189,13 +205,17 @@ TEST(ResizeNearestNeighborTest, SizeShapeInvalid_NEG)
 
 TEST(ResizeNearestNeighborTest, SizeDimInvalid_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
-                                                                           3, 6,  //
-                                                                           9, 12, //
-                                                                           4, 10, //
-                                                                           10, 16 //
-                                                                         });
-  Tensor size_tensor = makeInputTensor<DataType::S32>({3}, {3, 3, 1});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1},
+                                                           {
+                                                             3, 6,  //
+                                                             9, 12, //
+                                                             4, 10, //
+                                                             10, 16 //
+                                                           },
+                                                           memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>({3}, {3, 3, 1}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   ResizeNearestNeighborParams params{};
diff --git a/compiler/luci-interpreter/src/kernels/ReverseV2.test.cpp b/compiler/luci-interpreter/src/kernels/ReverseV2.test.cpp
index 6e1e6c03c..2bd94875b 100644
--- a/compiler/luci-interpreter/src/kernels/ReverseV2.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/ReverseV2.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/ReverseV2.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -36,6 +37,8 @@ TYPED_TEST_CASE(ReverseV2Test, DataTypes);
 
 TYPED_TEST(ReverseV2Test, MultiDimensions)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   // TypeParam
   std::vector<TypeParam> input_data{1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
                                     13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
@@ -47,13 +50,15 @@ TYPED_TEST(ReverseV2Test, MultiDimensions)
                                      17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20};
   std::vector<int32_t> output_shape{4, 3, 2};
 
-  Tensor input_tensor = makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data);
-  Tensor axis_tensor = makeInputTensor<DataType::S32>(axis_shape, axis_data);
+  Tensor input_tensor =
+    makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data, memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>(axis_shape, axis_data, memory_manager.get());
 
   Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
 
   ReverseV2 kernel = ReverseV2(&input_tensor, &axis_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
diff --git a/compiler/luci-interpreter/src/kernels/Rsqrt.test.cpp b/compiler/luci-interpreter/src/kernels/Rsqrt.test.cpp
index b93a04ddd..3c6494232 100644
--- a/compiler/luci-interpreter/src/kernels/Rsqrt.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Rsqrt.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Rsqrt.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -29,11 +30,15 @@ using namespace testing;
 void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
            std::initializer_list<float> input_data, std::initializer_list<float> output_data)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Rsqrt kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
@@ -58,7 +63,9 @@ TEST(RsqrtTest, SimpleRsqrt)
 
 TEST(RsqrtTest, Input_Output_Type_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S32);
 
   Rsqrt kernel(&input_tensor, &output_tensor);
@@ -67,11 +74,14 @@ TEST(RsqrtTest, Input_Output_Type_NEG)
 
 TEST(RsqrtTest, Invalid_Input_Type_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S64);
 
   Rsqrt kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   EXPECT_ANY_THROW(kernel.execute());
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/Slice.cpp b/compiler/luci-interpreter/src/kernels/Slice.cpp
index 626521815..37a834a18 100644
--- a/compiler/luci-interpreter/src/kernels/Slice.cpp
+++ b/compiler/luci-interpreter/src/kernels/Slice.cpp
@@ -16,7 +16,7 @@
 
 #include "kernels/Slice.h"
 #include "Utils.h"
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALSlice.h"
 
 #include <cassert>
 #include <cstring>
@@ -131,14 +131,13 @@ void Slice::execute() const
   switch (input()->element_type())
   {
     case DataType::FLOAT32:
-      tflite::optimized_ops::Slice(op_params, getTensorShape(input()),
-                                   getTensorData<float>(input()), getTensorShape(output()),
-                                   getTensorData<float>(output()));
+      luci_interpreter_pal::Slice(op_params, getTensorShape(input()), getTensorData<float>(input()),
+                                  getTensorShape(output()), getTensorData<float>(output()));
       break;
     case DataType::U8:
-      tflite::optimized_ops::Slice(op_params, getTensorShape(input()),
-                                   getTensorData<uint8_t>(input()), getTensorShape(output()),
-                                   getTensorData<uint8_t>(output()));
+      luci_interpreter_pal::Slice(op_params, getTensorShape(input()),
+                                  getTensorData<uint8_t>(input()), getTensorShape(output()),
+                                  getTensorData<uint8_t>(output()));
       break;
     default:
       throw std::runtime_error("Unsupported input type.");
diff --git a/compiler/luci-interpreter/src/kernels/Slice.test.cpp b/compiler/luci-interpreter/src/kernels/Slice.test.cpp
index a360a29cc..3e0d0b0d7 100644
--- a/compiler/luci-interpreter/src/kernels/Slice.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Slice.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Slice.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -35,6 +36,8 @@ TYPED_TEST_CASE(SliceTest, DataTypes);
 
 TYPED_TEST(SliceTest, SimpleTest)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   std::vector<TypeParam> input_data{1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6};
   Shape input_shape{3, 2, 3, 1};
   std::vector<int32_t> begin_data{1, 0, 0, 0};
@@ -44,14 +47,17 @@ TYPED_TEST(SliceTest, SimpleTest)
   std::vector<TypeParam> output_data{3, 3, 3, 5, 5, 5};
   std::vector<int32_t> output_shape{2, 1, 3, 1};
 
-  Tensor input_tensor = makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data);
-  Tensor begin_tensor = makeInputTensor<DataType::S32>(begin_shape, begin_data);
-  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data);
+  Tensor input_tensor =
+    makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data, memory_manager.get());
+  Tensor begin_tensor =
+    makeInputTensor<DataType::S32>(begin_shape, begin_data, memory_manager.get());
+  Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data, memory_manager.get());
 
   Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
 
   Slice kernel(&input_tensor, &begin_tensor, &size_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
diff --git a/compiler/luci-interpreter/src/kernels/Softmax.cpp b/compiler/luci-interpreter/src/kernels/Softmax.cpp
index 8e29f53ee..c230aaa70 100644
--- a/compiler/luci-interpreter/src/kernels/Softmax.cpp
+++ b/compiler/luci-interpreter/src/kernels/Softmax.cpp
@@ -19,7 +19,7 @@
 #include "kernels/Utils.h"
 
 #include <tensorflow/lite/kernels/internal/reference/softmax.h>
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALSoftmax.h"
 
 #include <stdexcept>
 
@@ -40,10 +40,12 @@ void Softmax::configure()
   LUCI_INTERPRETER_CHECK(input()->shape().num_dims() >= 1);
   if (input()->element_type() == DataType::U8 || input()->element_type() == DataType::S8)
   {
-    LUCI_INTERPRETER_CHECK(output()->zero_point() == 0);
+    LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::S8 || output()->zero_point() == 0);
+    LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::U8 ||
+                           output()->zero_point() == std::numeric_limits<int8_t>::min());
     tflite::SoftmaxParams op_params{};
     op_params.table = _table;
-    tflite::optimized_ops::PopulateSoftmaxLookupTable(&op_params, input()->scale(), params().beta);
+    luci_interpreter_pal::PopulateSoftmaxLookupTable(&op_params, input()->scale(), params().beta);
   }
   output()->resize(input()->shape());
 }
@@ -81,9 +83,9 @@ template <typename T> void Softmax::evalQuantized() const
   op_params.table = const_cast<float *>(_table);
   op_params.zero_point = output()->zero_point();
   op_params.scale = output()->scale();
-
-  tflite::optimized_ops::Softmax(op_params, getTensorShape(input()), getTensorData<T>(input()),
-                                 getTensorShape(output()), getTensorData<T>(output()));
+  luci_interpreter_pal::InitializeParams(&op_params, input()->scale(), params().beta);
+  luci_interpreter_pal::Softmax(op_params, getTensorShape(input()), getTensorData<T>(input()),
+                                getTensorShape(output()), getTensorData<T>(output()));
 }
 
 } // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Softmax.test.cpp b/compiler/luci-interpreter/src/kernels/Softmax.test.cpp
index c69a2f9cc..9de40b6ec 100644
--- a/compiler/luci-interpreter/src/kernels/Softmax.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Softmax.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Softmax.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -26,46 +27,60 @@ namespace
 
 using namespace testing;
 
-template <typename T>
+template <typename T> constexpr loco::DataType toLocoDataType();
+
+template <> constexpr loco::DataType toLocoDataType<float>() { return loco::DataType::FLOAT32; }
+
+template <> constexpr loco::DataType toLocoDataType<uint8_t>() { return loco::DataType::U8; }
+
+template <> constexpr loco::DataType toLocoDataType<int8_t>() { return loco::DataType::S8; }
+
+template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
 void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
            std::initializer_list<float> input_data, std::initializer_list<float> output_data)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor =
+    makeInputTensor<toLocoDataType<T>()>(input_shape, input_data, memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(toLocoDataType<T>());
 
   SoftmaxParams params{};
   params.beta = 0.1;
 
   Softmax kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<T>(output_tensor), FloatArrayNear(output_data));
   EXPECT_THAT(extractTensorShape(output_tensor), output_shape);
 }
 
-template <>
-void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
-                    std::initializer_list<int32_t> output_shape,
-                    std::initializer_list<float> input_data,
-                    std::initializer_list<float> output_data)
+template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+           std::initializer_list<float> input_data, std::initializer_list<float> output_data)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   std::pair<float, int32_t> input_quant_param =
-    quantizationParams<uint8_t>(std::min<float>(std::min<float>(input_data), 0.f),
-                                std::max<float>(std::max<float>(input_data), 0.f));
+    quantizationParams<T>(std::min<float>(std::min<float>(input_data), 0.f),
+                          std::max<float>(std::max<float>(input_data), 0.f));
   std::pair<float, int32_t> output_quant_param =
-    quantizationParams<uint8_t>(std::min<float>(std::min<float>(output_data), 0.f),
-                                std::max<float>(std::max<float>(output_data), 0.f));
-  Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, input_quant_param.first,
-                                                      input_quant_param.second, input_data);
+    quantizationParams<T>(std::min<float>(std::min<float>(output_data), 0.f),
+                          std::max<float>(std::max<float>(output_data), 0.f));
+  Tensor input_tensor = makeInputTensor<toLocoDataType<T>()>(input_shape, input_quant_param.first,
+                                                             input_quant_param.second, input_data,
+                                                             memory_manager.get());
   Tensor output_tensor =
-    makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
+    makeOutputTensor(toLocoDataType<T>(), output_quant_param.first, output_quant_param.second);
 
   SoftmaxParams params{};
   params.beta = 0.1;
 
   Softmax kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
@@ -77,7 +92,7 @@ template <typename T> class SoftmaxTest : public ::testing::Test
 {
 };
 
-using DataTypes = ::testing::Types<float, uint8_t>;
+using DataTypes = ::testing::Types<float, uint8_t, int8_t>;
 TYPED_TEST_CASE(SoftmaxTest, DataTypes);
 
 TYPED_TEST(SoftmaxTest, Simple)
diff --git a/compiler/luci-interpreter/src/kernels/SpaceToBatchND.cpp b/compiler/luci-interpreter/src/kernels/SpaceToBatchND.cpp
index 2f6a47925..630cd38c4 100644
--- a/compiler/luci-interpreter/src/kernels/SpaceToBatchND.cpp
+++ b/compiler/luci-interpreter/src/kernels/SpaceToBatchND.cpp
@@ -18,7 +18,7 @@
 #include "kernels/SpaceToBatchND.h"
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALSpaceToBatchND.h"
 
 #include <stdexcept>
 
@@ -80,7 +80,7 @@ void SpaceToBatchND::execute() const
     tflite::SpaceToBatchParams op_params;
     case DataType::FLOAT32:
       op_params.output_offset = 0;
-      tflite::optimized_ops::SpaceToBatchND(
+      luci_interpreter_pal::SpaceToBatchND(
         op_params, getTensorShape(input()), getTensorData<float>(input()),
         getTensorShape(block_shape()), getTensorData<int32_t>(block_shape()),
         getTensorShape(paddings()), getTensorData<int32_t>(paddings()), getTensorShape(output()),
@@ -88,7 +88,7 @@ void SpaceToBatchND::execute() const
       break;
     case DataType::U8:
       op_params.output_offset = output()->zero_point();
-      tflite::optimized_ops::SpaceToBatchND(
+      luci_interpreter_pal::SpaceToBatchND(
         op_params, getTensorShape(input()), getTensorData<uint8_t>(input()),
         getTensorShape(block_shape()), getTensorData<int32_t>(block_shape()),
         getTensorShape(paddings()), getTensorData<int32_t>(paddings()), getTensorShape(output()),
diff --git a/compiler/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp b/compiler/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp
index a6ec6f23f..e06501c8c 100644
--- a/compiler/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/SpaceToBatchND.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -34,14 +35,19 @@ void Check(std::initializer_list<int32_t> input_shape,
            std::initializer_list<int32_t> block_shape_data,
            std::initializer_list<int32_t> paddings_data, std::initializer_list<float> output_data)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   constexpr DataType element_type = getElementType<T>();
-  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
-  Tensor block_shape_tensor = makeInputTensor<DataType::S32>(block_shape_shape, block_shape_data);
-  Tensor paddings_tensor = makeInputTensor<DataType::S32>(paddings_shape, paddings_data);
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
+  Tensor block_shape_tensor =
+    makeInputTensor<DataType::S32>(block_shape_shape, block_shape_data, memory_manager.get());
+  Tensor paddings_tensor =
+    makeInputTensor<DataType::S32>(paddings_shape, paddings_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(element_type);
 
   SpaceToBatchND kernel(&input_tensor, &block_shape_tensor, &paddings_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
@@ -55,17 +61,23 @@ void Check<uint8_t>(
   std::initializer_list<float> input_data, std::initializer_list<int32_t> block_shape_data,
   std::initializer_list<int32_t> paddings_data, std::initializer_list<float> output_data)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   std::pair<float, int32_t> input_quant_param =
     quantizationParams<uint8_t>(std::min(input_data), std::max(input_data));
-  Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, input_quant_param.first,
-                                                      input_quant_param.second, input_data);
-  Tensor block_shape_tensor = makeInputTensor<DataType::S32>(block_shape_shape, block_shape_data);
-  Tensor paddings_tensor = makeInputTensor<DataType::S32>(paddings_shape, paddings_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>(input_shape, input_quant_param.first, input_quant_param.second,
+                                  input_data, memory_manager.get());
+  Tensor block_shape_tensor =
+    makeInputTensor<DataType::S32>(block_shape_shape, block_shape_data, memory_manager.get());
+  Tensor paddings_tensor =
+    makeInputTensor<DataType::S32>(paddings_shape, paddings_data, memory_manager.get());
   Tensor output_tensor =
     makeOutputTensor(DataType::U8, input_quant_param.first, input_quant_param.second);
 
   SpaceToBatchND kernel(&input_tensor, &block_shape_tensor, &paddings_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(dequantizeTensorData(output_tensor),
@@ -93,10 +105,13 @@ TYPED_TEST(SpaceToBatchNDTest, Simple)
 
 TEST(SpaceToBatchNDTest, Invalid_Shape_NEG)
 {
-  Tensor input_tensor =
-    makeInputTensor<DataType::FLOAT32>({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
-  Tensor block_shape_tensor = makeInputTensor<DataType::S32>({2}, {2, 2});
-  Tensor paddings_tensor = makeInputTensor<DataType::S32>({2, 2}, {0, 0, 0, 0});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(
+    {1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, memory_manager.get());
+  Tensor block_shape_tensor = makeInputTensor<DataType::S32>({2}, {2, 2}, memory_manager.get());
+  Tensor paddings_tensor =
+    makeInputTensor<DataType::S32>({2, 2}, {0, 0, 0, 0}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   SpaceToBatchND kernel(&input_tensor, &block_shape_tensor, &paddings_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/SpaceToDepth.cpp b/compiler/luci-interpreter/src/kernels/SpaceToDepth.cpp
index fc999372a..7c29e8cb0 100644
--- a/compiler/luci-interpreter/src/kernels/SpaceToDepth.cpp
+++ b/compiler/luci-interpreter/src/kernels/SpaceToDepth.cpp
@@ -16,7 +16,7 @@
 
 #include "SpaceToDepth.h"
 #include "Utils.h"
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALSpaceToDepth.h"
 
 namespace luci_interpreter
 {
@@ -61,14 +61,14 @@ void SpaceToDepth::execute() const
   switch (input()->element_type())
   {
     case DataType::FLOAT32:
-      tflite::optimized_ops::SpaceToDepth(op_params, getTensorShape(input()),
-                                          getTensorData<float>(input()), getTensorShape(output()),
-                                          getTensorData<float>(output()));
+      luci_interpreter_pal::SpaceToDepth(op_params, getTensorShape(input()),
+                                         getTensorData<float>(input()), getTensorShape(output()),
+                                         getTensorData<float>(output()));
       break;
     case DataType::U8:
-      tflite::optimized_ops::SpaceToDepth(op_params, getTensorShape(input()),
-                                          getTensorData<uint8_t>(input()), getTensorShape(output()),
-                                          getTensorData<uint8_t>(output()));
+      luci_interpreter_pal::SpaceToDepth(op_params, getTensorShape(input()),
+                                         getTensorData<uint8_t>(input()), getTensorShape(output()),
+                                         getTensorData<uint8_t>(output()));
       break;
     default:
       throw std::runtime_error("Unsupported type.");
diff --git a/compiler/luci-interpreter/src/kernels/SpaceToDepth.test.cpp b/compiler/luci-interpreter/src/kernels/SpaceToDepth.test.cpp
index 77b6655dc..735c010b9 100644
--- a/compiler/luci-interpreter/src/kernels/SpaceToDepth.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/SpaceToDepth.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/SpaceToDepth.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -35,10 +36,13 @@ TYPED_TEST_CASE(SpaceToDepthTest, DataTypes);
 
 TYPED_TEST(SpaceToDepthTest, SimpleCase)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   constexpr DataType element_type = getElementType<TypeParam>();
   std::vector<TypeParam> input_data{1, 5, 6, 7, 2, 3, 4, 8};
   Shape input_shape{1, 2, 2, 2};
-  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
   std::vector<TypeParam> output_data{1, 5, 6, 7, 2, 3, 4, 8};
   std::vector<int32_t> output_shape{1, 1, 1, 8};
   Tensor output_tensor = makeOutputTensor(element_type);
@@ -48,6 +52,7 @@ TYPED_TEST(SpaceToDepthTest, SimpleCase)
 
   SpaceToDepth kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
diff --git a/compiler/luci-interpreter/src/kernels/Split.cpp b/compiler/luci-interpreter/src/kernels/Split.cpp
index 0da0f3779..1a563f307 100644
--- a/compiler/luci-interpreter/src/kernels/Split.cpp
+++ b/compiler/luci-interpreter/src/kernels/Split.cpp
@@ -18,7 +18,7 @@
 
 #include "Utils.h"
 
-#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+#include "PALSplit.h"
 
 namespace luci_interpreter
 {
@@ -56,11 +56,11 @@ void Split::execute() const
   params.num_split = _outputs.size();
   params.axis = _axis_value;
 
-#define TF_LITE_SPLIT(scalar)                                                                     \
-  {                                                                                               \
-    VectorOfTensors<scalar, false> all_outputs(_outputs);                                         \
-    tflite::optimized_ops::Split(params, getTensorShape(input()), getTensorData<scalar>(input()), \
-                                 all_outputs.shapes(), all_outputs.data());                       \
+#define TF_LITE_SPLIT(scalar)                                                                    \
+  {                                                                                              \
+    VectorOfTensors<scalar, false> all_outputs(_outputs);                                        \
+    luci_interpreter_pal::Split(params, getTensorShape(input()), getTensorData<scalar>(input()), \
+                                all_outputs.shapes(), all_outputs.data());                       \
   }
 
   switch (input()->element_type())
diff --git a/compiler/luci-interpreter/src/kernels/Split.test.cpp b/compiler/luci-interpreter/src/kernels/Split.test.cpp
index c558928e8..74d57aed3 100644
--- a/compiler/luci-interpreter/src/kernels/Split.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Split.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Split.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -32,9 +33,12 @@ void Check(int axis, int num_splits, std::initializer_list<int32_t> input_shape,
            std::initializer_list<int32_t> output_shape, std::initializer_list<T> input_data,
            std::vector<std::vector<T>> output_data)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   constexpr DataType element_type = getElementType<T>();
-  Tensor axis_tensor = makeInputTensor<DataType::S32>({}, {axis});
-  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({}, {axis}, memory_manager.get());
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
 
   std::vector<Tensor> output_tensors;
   output_tensors.reserve(num_splits);
@@ -51,6 +55,10 @@ void Check(int axis, int num_splits, std::initializer_list<int32_t> input_shape,
 
   Split kernel(&axis_tensor, &input_tensor, std::move(output_tensor_ptrs));
   kernel.configure();
+  for (int i = 0; i < num_splits; ++i)
+  {
+    memory_manager->allocate_memory(output_tensors[i]);
+  }
   kernel.execute();
 
   for (int i = 0; i < num_splits; ++i)
diff --git a/compiler/luci-interpreter/src/kernels/SplitV.cpp b/compiler/luci-interpreter/src/kernels/SplitV.cpp
new file mode 100644
index 000000000..281988272
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/SplitV.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "SplitV.h"
+
+#include "Utils.h"
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+SplitV::SplitV(const Tensor *input, const Tensor *size_splits, const Tensor *axis,
+               std::vector<Tensor *> outputs)
+  : Kernel({input, size_splits, axis}, std::move(outputs))
+{
+}
+
+void SplitV::configure()
+{
+  assert(axis()->shape().num_elements() == 1);
+  _axis_value = getTensorData<int32_t>(axis())[0];
+  if (_axis_value < 0)
+    _axis_value += input()->shape().num_dims();
+  assert(_axis_value >= 0 && _axis_value < input()->shape().num_dims());
+
+  auto num_split = static_cast<int32_t>(_outputs.size());
+  auto sizes_data = getTensorData<int32_t>(size_splits());
+
+  assert(size_splits()->shape().num_dims() == 1);
+  assert(size_splits()->shape().num_elements() == num_split);
+  assert(std::accumulate(sizes_data, sizes_data + num_split, 0) ==
+         input()->shape().dim(_axis_value));
+
+  auto output_shape = input()->shape();
+  for (int32_t i = 0; i < num_split; ++i)
+  {
+    output_shape.dim(_axis_value) = sizes_data[i];
+    _outputs[i]->resize(output_shape);
+  }
+}
+
+void SplitV::execute() const
+{
+  tflite::SplitParams params{};
+  params.num_split = _outputs.size();
+  params.axis = _axis_value;
+
+#define TF_LITE_SPLIT(scalar)                                                                     \
+  {                                                                                               \
+    VectorOfTensors<scalar, false> all_outputs(_outputs);                                         \
+    tflite::optimized_ops::Split(params, getTensorShape(input()), getTensorData<scalar>(input()), \
+                                 all_outputs.shapes(), all_outputs.data());                       \
+  }
+
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      TF_LITE_SPLIT(float);
+      break;
+    case DataType::U8:
+      TF_LITE_SPLIT(uint8_t);
+      break;
+    case DataType::S16:
+      TF_LITE_SPLIT(int16_t);
+      break;
+    default:
+      throw std::runtime_error("Unsupported type.");
+  }
+#undef TF_LITE_SPLIT
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/SplitV.h b/compiler/luci-interpreter/src/kernels/SplitV.h
new file mode 100644
index 000000000..92f6288fb
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/SplitV.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SPLIT_V_H
+#define LUCI_INTERPRETER_KERNELS_SPLIT_V_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class SplitV : public Kernel
+{
+public:
+  SplitV(const Tensor *input, const Tensor *size_splits, const Tensor *axis,
+         std::vector<Tensor *> outputs);
+
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *size_splits() const { return _inputs[1]; }
+  const Tensor *axis() const { return _inputs[2]; }
+  Tensor *output(int index) const { return _outputs[index]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  int32_t _axis_value{};
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SPLIT_V_H
diff --git a/compiler/luci-interpreter/src/kernels/SplitV.test.cpp b/compiler/luci-interpreter/src/kernels/SplitV.test.cpp
new file mode 100644
index 000000000..aac0567d7
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/SplitV.test.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/SplitV.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(int axis, std::initializer_list<int32_t> splits_size,
+           std::initializer_list<int32_t> input_shape, std::initializer_list<T> input_data,
+           std::vector<std::vector<T>> output_data)
+{
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  constexpr DataType element_type = getElementType<T>();
+
+  auto num_splits = static_cast<int32_t>(splits_size.size());
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
+  Tensor sizes_tensor =
+    makeInputTensor<DataType::S32>({num_splits}, splits_size, memory_manager.get());
+  Tensor axis_tensor = makeInputTensor<DataType::S32>({}, {axis}, memory_manager.get());
+
+  std::vector<Tensor> output_tensors;
+  output_tensors.reserve(num_splits);
+  for (int i = 0; i < num_splits; ++i)
+  {
+    output_tensors.emplace_back(makeOutputTensor(element_type));
+  }
+
+  std::vector<Tensor *> output_tensor_ptrs(num_splits);
+  for (int i = 0; i < num_splits; ++i)
+  {
+    output_tensor_ptrs[i] = &output_tensors[i];
+  }
+
+  SplitV kernel(&input_tensor, &sizes_tensor, &axis_tensor, std::move(output_tensor_ptrs));
+  kernel.configure();
+  for (int i = 0; i < num_splits; ++i)
+  {
+    memory_manager->allocate_memory(output_tensors[i]);
+  }
+  kernel.execute();
+
+  for (int i = 0; i < num_splits; ++i)
+  {
+    auto tmp = extractTensorData<T>(output_tensors[i]);
+    EXPECT_THAT(extractTensorData<T>(output_tensors[i]),
+                ::testing::ElementsAreArray(output_data[i]));
+  }
+}
+
+template <typename T> class SplitVTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t, int16_t>;
+TYPED_TEST_CASE(SplitVTest, DataTypes);
+
+TYPED_TEST(SplitVTest, ThreeDimensional)
+{
+  Check<TypeParam>(
+    /*axis=*/0, /*splits_size=*/{1, 2}, {3, 3, 3},
+    {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14,
+     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
+    {
+      {1, 2, 3, 4, 5, 6, 7, 8, 9},                                             //
+      {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27} //
+    });
+  Check<TypeParam>(
+    /*axis=*/1, /*splits_size=*/{1, 2}, {3, 3, 3},
+    {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14,
+     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
+    {
+      {1, 2, 3, 10, 11, 12, 19, 20, 21},                                 //
+      {4, 5, 6, 7, 8, 9, 13, 14, 15, 16, 17, 18, 22, 23, 24, 25, 26, 27} //
+    });
+  Check<TypeParam>(
+    /*axis=*/2, /*splits_size=*/{1, 2}, {3, 3, 3},
+    {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14,
+     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
+    {
+      {1, 4, 7, 10, 13, 16, 19, 22, 25},                                 //
+      {2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, 24, 26, 27} //
+    });
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Sqrt.test.cpp b/compiler/luci-interpreter/src/kernels/Sqrt.test.cpp
index e40a91e97..96835fbfc 100644
--- a/compiler/luci-interpreter/src/kernels/Sqrt.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Sqrt.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Sqrt.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -29,11 +30,15 @@ using namespace testing;
 void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
            std::initializer_list<float> input_data, std::initializer_list<float> output_data)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Sqrt kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(output_data));
@@ -58,20 +63,25 @@ TEST(SqrtTest, SimpleSqrt)
 
 TEST(SqrtTest, Input_Output_Type_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S32);
 
   Sqrt kernel(&input_tensor, &output_tensor);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(AddTest, Invalid_Input_Type_NEG)
+TEST(SqrtTest, Invalid_Input_Type_NEG)
 {
-  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
+  Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1}, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S64);
 
   Sqrt kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   EXPECT_ANY_THROW(kernel.execute());
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/Square.test.cpp b/compiler/luci-interpreter/src/kernels/Square.test.cpp
index 730d6405c..51662dea7 100644
--- a/compiler/luci-interpreter/src/kernels/Square.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Square.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Square.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -29,13 +30,17 @@ using namespace testing;
 
 TEST(SquareTest, Float)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   Shape input_shape{3, 1, 2};
   std::vector<float> input_data1{1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data1);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data1, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Square kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{1.0, 0.0, 1.0, 121.0, 4.0, 2.0736};
diff --git a/compiler/luci-interpreter/src/kernels/SquaredDifference.test.cpp b/compiler/luci-interpreter/src/kernels/SquaredDifference.test.cpp
index a72eaadfa..2819c01e2 100644
--- a/compiler/luci-interpreter/src/kernels/SquaredDifference.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/SquaredDifference.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/SquaredDifference.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -29,15 +30,20 @@ using namespace testing;
 
 TEST(SquaredDifferenceTest, Float)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   Shape input_shape{3, 1, 2};
   std::vector<float> input_data1{1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
   std::vector<float> input_data2{-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
-  Tensor input_tensor1 = makeInputTensor<DataType::FLOAT32>(input_shape, input_data1);
-  Tensor input_tensor2 = makeInputTensor<DataType::FLOAT32>(input_shape, input_data2);
+  Tensor input_tensor1 =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data1, memory_manager.get());
+  Tensor input_tensor2 =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data2, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   SquaredDifference kernel(&input_tensor1, &input_tensor2, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{4.0, 0.0, 4.0, 1.0, 1.0, 0.0001};
@@ -46,16 +52,21 @@ TEST(SquaredDifferenceTest, Float)
 
 TEST(SquaredDifferenceTest, FloatBroadcast)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   Shape input_shape1{3, 1, 2};
   Shape input_shape2{1};
   std::vector<float> input_data1{1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
   std::vector<float> input_data2{1.0};
-  Tensor input_tensor1 = makeInputTensor<DataType::FLOAT32>(input_shape1, input_data1);
-  Tensor input_tensor2 = makeInputTensor<DataType::FLOAT32>(input_shape2, input_data2);
+  Tensor input_tensor1 =
+    makeInputTensor<DataType::FLOAT32>(input_shape1, input_data1, memory_manager.get());
+  Tensor input_tensor2 =
+    makeInputTensor<DataType::FLOAT32>(input_shape2, input_data2, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   SquaredDifference kernel(&input_tensor1, &input_tensor2, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{0.0, 1.0, 4.0, 100.0, 9.0, 5.9536};
diff --git a/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp b/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp
index 1c81893b9..d3326fe98 100644
--- a/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Squeeze.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -31,8 +32,11 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
            std::initializer_list<T> input_data, std::initializer_list<T> output_data,
            std::initializer_list<int32_t> squeeze_dims)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   constexpr DataType element_type = getElementType<T>();
-  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(element_type);
 
   SqueezeParams params{};
@@ -40,6 +44,7 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
 
   Squeeze kernel(&input_tensor, &output_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
diff --git a/compiler/luci-interpreter/src/kernels/StridedSlice.cpp b/compiler/luci-interpreter/src/kernels/StridedSlice.cpp
index 37b0dd8c5..c6452cdb0 100644
--- a/compiler/luci-interpreter/src/kernels/StridedSlice.cpp
+++ b/compiler/luci-interpreter/src/kernels/StridedSlice.cpp
@@ -19,7 +19,7 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/strided_slice.h>
 
 #include <stdexcept>
 
diff --git a/compiler/luci-interpreter/src/kernels/StridedSlice.test.cpp b/compiler/luci-interpreter/src/kernels/StridedSlice.test.cpp
index 66dffcaf2..399cdebed 100644
--- a/compiler/luci-interpreter/src/kernels/StridedSlice.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/StridedSlice.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/StridedSlice.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -28,6 +29,8 @@ using namespace testing;
 
 TEST(StridedSliceTest, Float)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   Shape input_shape{2, 3, 2};
   std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
   Shape begin_shape{3};
@@ -36,10 +39,13 @@ TEST(StridedSliceTest, Float)
   std::vector<int32_t> end_data{1, 3, 2};
   Shape strides_shape{3};
   std::vector<int32_t> strides_data{1, 1, 1};
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
-  Tensor begin_tensor = makeInputTensor<DataType::S32>(begin_shape, begin_data);
-  Tensor end_tensor = makeInputTensor<DataType::S32>(end_shape, end_data);
-  Tensor strides_tensor = makeInputTensor<DataType::S32>(strides_shape, strides_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
+  Tensor begin_tensor =
+    makeInputTensor<DataType::S32>(begin_shape, begin_data, memory_manager.get());
+  Tensor end_tensor = makeInputTensor<DataType::S32>(end_shape, end_data, memory_manager.get());
+  Tensor strides_tensor =
+    makeInputTensor<DataType::S32>(strides_shape, strides_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   StridedSliceParams params{};
@@ -52,6 +58,7 @@ TEST(StridedSliceTest, Float)
   StridedSlice kernel(&input_tensor, &begin_tensor, &end_tensor, &strides_tensor, &output_tensor,
                       params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<int32_t> output_shape{3, 2};
@@ -62,6 +69,8 @@ TEST(StridedSliceTest, Float)
 
 TEST(StridedSliceTest, Uint8)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   Shape input_shape{2, 3, 2};
   std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
   Shape begin_shape{3};
@@ -70,10 +79,13 @@ TEST(StridedSliceTest, Uint8)
   std::vector<int32_t> end_data{1, 3, 2};
   Shape strides_shape{3};
   std::vector<int32_t> strides_data{1, 1, 1};
-  Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, 1.0f, 0, input_data);
-  Tensor begin_tensor = makeInputTensor<DataType::S32>(begin_shape, begin_data);
-  Tensor end_tensor = makeInputTensor<DataType::S32>(end_shape, end_data);
-  Tensor strides_tensor = makeInputTensor<DataType::S32>(strides_shape, strides_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>(input_shape, 1.0f, 0, input_data, memory_manager.get());
+  Tensor begin_tensor =
+    makeInputTensor<DataType::S32>(begin_shape, begin_data, memory_manager.get());
+  Tensor end_tensor = makeInputTensor<DataType::S32>(end_shape, end_data, memory_manager.get());
+  Tensor strides_tensor =
+    makeInputTensor<DataType::S32>(strides_shape, strides_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, 1.0f, 0);
 
   StridedSliceParams params{};
@@ -86,6 +98,7 @@ TEST(StridedSliceTest, Uint8)
   StridedSlice kernel(&input_tensor, &begin_tensor, &end_tensor, &strides_tensor, &output_tensor,
                       params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<int32_t> output_shape{3, 2};
diff --git a/compiler/luci-interpreter/src/kernels/Sub.cpp b/compiler/luci-interpreter/src/kernels/Sub.cpp
index 3c7588d62..603c62d0f 100644
--- a/compiler/luci-interpreter/src/kernels/Sub.cpp
+++ b/compiler/luci-interpreter/src/kernels/Sub.cpp
@@ -18,7 +18,9 @@
 #include "kernels/Sub.h"
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
+#include "PALSub.h"
+
+#include <tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h>
 
 #include <stdexcept>
 
@@ -74,9 +76,9 @@ void Sub::evalFloat() const
   }
   else
   {
-    tflite::optimized_ops::Sub(params, getTensorShape(input1()), getTensorData<float>(input1()),
-                               getTensorShape(input2()), getTensorData<float>(input2()),
-                               getTensorShape(output()), getTensorData<float>(output()));
+    luci_interpreter_pal::Sub(params, getTensorShape(input1()), getTensorData<float>(input1()),
+                              getTensorShape(input2()), getTensorData<float>(input2()),
+                              getTensorShape(output()), getTensorData<float>(output()));
   }
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/Sub.test.cpp b/compiler/luci-interpreter/src/kernels/Sub.test.cpp
index f560ceb36..c189f4481 100644
--- a/compiler/luci-interpreter/src/kernels/Sub.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Sub.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Sub.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 #include <algorithm>
 
@@ -33,6 +34,14 @@ using std::vector;
 using std::transform;
 using std::initializer_list;
 
+class SubTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
 // for quantized Add, the error shouldn't exceed step
 float GetTolerance(float min, float max)
 {
@@ -40,7 +49,7 @@ float GetTolerance(float min, float max)
   return kQuantizedStep;
 }
 
-TEST(SubTest, Uint8)
+TEST_F(SubTest, Uint8)
 {
   Shape base_shape = {2, 3, 1, 2};
   vector<float> base_data = {-0.3f, 2.3f, 0.9f,  0.5f, 0.8f, -1.1f,
@@ -62,10 +71,10 @@ TEST(SubTest, Uint8)
   pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-3.f, 3.f);
   for (size_t i = 0; i < output_data.size(); ++i)
   {
-    Tensor input1_tensor =
-      makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
-    Tensor input2_tensor = makeInputTensor<DataType::U8>(test_shapes[i], quant_param.first,
-                                                         quant_param.second, test_data);
+    Tensor input1_tensor = makeInputTensor<DataType::U8>(
+      base_shape, quant_param.first, quant_param.second, base_data, _memory_manager.get());
+    Tensor input2_tensor = makeInputTensor<DataType::U8>(
+      test_shapes[i], quant_param.first, quant_param.second, test_data, _memory_manager.get());
     Tensor output_tensor =
       makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
 
@@ -74,6 +83,7 @@ TEST(SubTest, Uint8)
 
     Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(dequantizeTensorData(output_tensor),
@@ -90,10 +100,10 @@ TEST(SubTest, Uint8)
   // Re-run with exchanged inputs.
   for (size_t i = 0; i < output_data.size(); ++i)
   {
-    Tensor input1_tensor = makeInputTensor<DataType::U8>(test_shapes[i], quant_param.first,
-                                                         quant_param.second, test_data);
-    Tensor input2_tensor =
-      makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
+    Tensor input1_tensor = makeInputTensor<DataType::U8>(
+      test_shapes[i], quant_param.first, quant_param.second, test_data, _memory_manager.get());
+    Tensor input2_tensor = makeInputTensor<DataType::U8>(
+      base_shape, quant_param.first, quant_param.second, base_data, _memory_manager.get());
     Tensor output_tensor =
       makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
 
@@ -102,6 +112,7 @@ TEST(SubTest, Uint8)
 
     Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(dequantizeTensorData(output_tensor),
@@ -110,7 +121,7 @@ TEST(SubTest, Uint8)
   }
 }
 
-TEST(SubTest, Float)
+TEST_F(SubTest, Float)
 {
   Shape base_shape = {2, 3, 1, 2};
   vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
@@ -130,8 +141,10 @@ TEST(SubTest, Float)
   vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
   for (size_t i = 0; i < test_shapes.size(); ++i)
   {
-    Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>(base_shape, input1_data);
-    Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>(test_shapes[i], input2_data);
+    Tensor input1_tensor =
+      makeInputTensor<DataType::FLOAT32>(base_shape, input1_data, _memory_manager.get());
+    Tensor input2_tensor =
+      makeInputTensor<DataType::FLOAT32>(test_shapes[i], input2_data, _memory_manager.get());
     Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
     SubParams params{};
@@ -139,6 +152,7 @@ TEST(SubTest, Float)
 
     Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
     kernel.configure();
+    _memory_manager->allocate_memory(output_tensor);
     kernel.execute();
 
     EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
@@ -148,10 +162,10 @@ TEST(SubTest, Float)
   }
 }
 
-TEST(SubTest, Input_Output_Type_NEG)
+TEST_F(SubTest, Input_Output_Type_NEG)
 {
-  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
-  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {2});
+  Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {2}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   SubParams params{};
@@ -161,10 +175,10 @@ TEST(SubTest, Input_Output_Type_NEG)
   EXPECT_ANY_THROW(kernel.configure());
 }
 
-TEST(SubTest, Invalid_Input_Type_NEG)
+TEST_F(SubTest, Invalid_Input_Type_NEG)
 {
-  Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1});
-  Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2});
+  Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get());
+  Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S64);
 
   SubParams params{};
@@ -172,6 +186,7 @@ TEST(SubTest, Invalid_Input_Type_NEG)
 
   Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   EXPECT_ANY_THROW(kernel.execute());
 }
 
diff --git a/compiler/luci-interpreter/src/kernels/Tanh.cpp b/compiler/luci-interpreter/src/kernels/Tanh.cpp
index 1c3d1281d..c4fa16912 100644
--- a/compiler/luci-interpreter/src/kernels/Tanh.cpp
+++ b/compiler/luci-interpreter/src/kernels/Tanh.cpp
@@ -18,7 +18,7 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/tanh.h>
 
 namespace luci_interpreter
 {
diff --git a/compiler/luci-interpreter/src/kernels/Tanh.test.cpp b/compiler/luci-interpreter/src/kernels/Tanh.test.cpp
index ef727d6eb..bfae479a9 100644
--- a/compiler/luci-interpreter/src/kernels/Tanh.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Tanh.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Tanh.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -27,18 +28,28 @@ namespace
 
 using namespace testing;
 
-TEST(TanhTest, Float)
+class TanhTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(TanhTest, Float)
 {
   Shape input_shape{1, 2, 4, 1};
   std::vector<float> input_data{
     0, -6, 2,  4, //
     3, -2, 10, 1, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Tanh kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{
@@ -48,7 +59,7 @@ TEST(TanhTest, Float)
   EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
 }
 
-TEST(TanhTest, Uint8)
+TEST_F(TanhTest, Uint8)
 {
   float kMin = -1;
   float kMax = 127.f / 128.f;
@@ -69,13 +80,15 @@ TEST(TanhTest, Uint8)
     0,  -6, 2, 4, //
     -4, -2, 8, 1, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::U8>({2, 6, 4, 1}, input_quant_param.first,
-                                                      input_quant_param.second, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::U8>({2, 6, 4, 1}, input_quant_param.first, input_quant_param.second,
+                                  input_data, _memory_manager.get());
   Tensor output_tensor =
     makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
 
   Tanh kernel(&input_tensor, &output_tensor);
   kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   std::vector<float> ref_output_data{
@@ -97,7 +110,7 @@ TEST(TanhTest, Uint8)
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
 }
 
-TEST(TanhTest, InputTypeInvalid_NEG)
+TEST_F(TanhTest, InputTypeInvalid_NEG)
 {
   std::vector<int64_t> input_data{
     0,  -6, 2, 4, //
@@ -113,14 +126,16 @@ TEST(TanhTest, InputTypeInvalid_NEG)
     0,  -6, 2, 4, //
     -4, -2, 8, 1, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::S64>({2, 6, 4, 1}, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S64>({2, 6, 4, 1}, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   Tanh kernel(&input_tensor, &output_tensor);
+  _memory_manager->allocate_memory(output_tensor);
   EXPECT_ANY_THROW(kernel.execute());
 }
 
-TEST(TanhTest, InputOutputMismatch_NEG)
+TEST_F(TanhTest, InputOutputMismatch_NEG)
 {
   std::vector<float> input_data{
     0,  -6, 2, 4, //
@@ -136,7 +151,8 @@ TEST(TanhTest, InputOutputMismatch_NEG)
     0,  -6, 2, 4, //
     -4, -2, 8, 1, //
   };
-  Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 6, 4, 1}, input_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>({2, 6, 4, 1}, input_data, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8);
 
   Tanh kernel(&input_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/TestUtils.cpp b/compiler/luci-interpreter/src/kernels/TestUtils.cpp
index 831dc4247..4d983adda 100644
--- a/compiler/luci-interpreter/src/kernels/TestUtils.cpp
+++ b/compiler/luci-interpreter/src/kernels/TestUtils.cpp
@@ -43,6 +43,11 @@ std::vector<float> dequantizeTensorData(const Tensor &tensor)
     std::vector<uint8_t> data = extractTensorData<uint8_t>(tensor);
     return dequantize(data.data(), data.size(), tensor.scale(), tensor.zero_point());
   }
+  if (tensor.element_type() == DataType::S8)
+  {
+    std::vector<int8_t> data = extractTensorData<int8_t>(tensor);
+    return dequantize(data.data(), data.size(), tensor.scale(), tensor.zero_point());
+  }
   else if (tensor.element_type() == DataType::S16)
   {
     // S16 quantization is symmetric, so zero point should be zero.
diff --git a/compiler/luci-interpreter/src/kernels/TestUtils.h b/compiler/luci-interpreter/src/kernels/TestUtils.h
index c4c73d546..1f5a0c308 100644
--- a/compiler/luci-interpreter/src/kernels/TestUtils.h
+++ b/compiler/luci-interpreter/src/kernels/TestUtils.h
@@ -19,6 +19,7 @@
 #define LUCI_INTERPRETER_KERNELS_TESTUTILS_H
 
 #include "luci_interpreter/core/Tensor.h"
+#include "luci_interpreter/MemoryManager.h"
 
 #include <type_traits>
 
@@ -36,9 +37,11 @@ template <typename T>
 std::vector<T> quantize(const float *data, size_t num_elements, float scale, int32_t zero_point);
 
 template <DataType DT>
-Tensor makeInputTensor(const Shape &shape, const std::vector<typename DataTypeImpl<DT>::Type> &data)
+Tensor makeInputTensor(const Shape &shape, const std::vector<typename DataTypeImpl<DT>::Type> &data,
+                       IMemoryManager *memory_manager)
 {
   Tensor tensor(DT, shape, {}, "");
+  memory_manager->allocate_memory(tensor);
   tensor.writeData(data.data(), data.size() * sizeof(typename DataTypeImpl<DT>::Type));
   return tensor;
 }
@@ -50,16 +53,18 @@ Tensor makeInputTensor(const Shape &shape, const std::vector<typename DataTypeIm
  * @param scale scale of quantized number
  * @param zero_point zero point of quantized number, should be 0 for signed datatypes
  * @param data floating point data for quantization
+ * @param memory_manager memory manager for allocating memory to tensor
  * @return created tensor
  */
 template <DataType DT>
 Tensor makeInputTensor(const Shape &shape, float scale, int32_t zero_point,
-                       const std::vector<float> &data)
+                       const std::vector<float> &data, IMemoryManager *memory_manager)
 {
   using NativeT = typename DataTypeImpl<DT>::Type;
   Tensor tensor(DT, shape, {{scale}, {zero_point}}, "");
   std::vector<NativeT> quantized_data =
     quantize<NativeT>(data.data(), data.size(), scale, zero_point);
+  memory_manager->allocate_memory(tensor);
   tensor.writeData(quantized_data.data(), quantized_data.size() * sizeof(NativeT));
   return tensor;
 }
@@ -72,12 +77,13 @@ Tensor makeInputTensor(const Shape &shape, float scale, int32_t zero_point,
  * @param zero_points zero points of quantized number, should be 0 for signed datatypes
  * @param quantize_dimension dimension to apply quantization along. Usually channels/output channels
  * @param data floating point data for quantization
+ * @param memory_manager memory manager for allocating memory to tensor
  * @return created tensor
  */
 template <DataType DT>
 Tensor makeInputTensor(const Shape &shape, const std::vector<float> &scales,
                        const std::vector<int32_t> &zero_points, int quantized_dimension,
-                       const std::vector<float> &data)
+                       const std::vector<float> &data, IMemoryManager *memory_manager)
 {
   using NativeT = typename DataTypeImpl<DT>::Type;
   assert(quantized_dimension < shape.num_dims());
@@ -113,6 +119,7 @@ Tensor makeInputTensor(const Shape &shape, const std::vector<float> &scales,
                             part_quantized_data.end());
     }
   assert(quantized_data.size() == shape.num_elements());
+  memory_manager->allocate_memory(tensor);
   tensor.writeData(quantized_data.data(), quantized_data.size() * sizeof(NativeT));
   return tensor;
 }
@@ -127,12 +134,26 @@ template <typename T> constexpr DataType getElementType()
 {
   if (std::is_same<T, float>::value)
     return DataType::FLOAT32;
+  if (std::is_same<T, double>::value)
+    return DataType::FLOAT64;
   if (std::is_same<T, uint8_t>::value)
     return DataType::U8;
+  if (std::is_same<T, uint16_t>::value)
+    return DataType::U16;
+  if (std::is_same<T, uint32_t>::value)
+    return DataType::U32;
+  if (std::is_same<T, uint64_t>::value)
+    return DataType::U64;
+  if (std::is_same<T, int8_t>::value)
+    return DataType::S8;
+  if (std::is_same<T, int16_t>::value)
+    return DataType::S16;
   if (std::is_same<T, int32_t>::value)
     return DataType::S32;
   if (std::is_same<T, int64_t>::value)
     return DataType::S64;
+  if (std::is_same<T, bool>::value)
+    return DataType::BOOL;
   return DataType::Unknown;
 }
 
@@ -156,8 +177,6 @@ std::vector<T> quantize(const float *data, size_t num_elements, float scale, int
   float q_min{}, q_max{};
   if (std::is_signed<T>::value)
   {
-    // For now, assume that signed type implies signed symmetric quantization.
-    assert(zero_point == 0);
     q_min = -std::numeric_limits<T>::max();
     q_max = std::numeric_limits<T>::max();
   }
diff --git a/compiler/luci-interpreter/src/kernels/Transpose.cpp b/compiler/luci-interpreter/src/kernels/Transpose.cpp
index c1a11cdb0..802d87295 100644
--- a/compiler/luci-interpreter/src/kernels/Transpose.cpp
+++ b/compiler/luci-interpreter/src/kernels/Transpose.cpp
@@ -18,7 +18,7 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/transpose.h>
 
 #include <stdexcept>
 
@@ -37,7 +37,7 @@ void Transpose::configure()
 {
   // Transpose op only supports 1D-4D input arrays.
   int dims = input()->shape().num_dims();
-  const int *perm_data = getTensorData<int32_t>(perm());
+  const int32_t *perm_data = getTensorData<int32_t>(perm());
 
   assert(input()->shape().num_dims() <= 4);
   assert(input()->element_type() == output()->element_type());
@@ -58,8 +58,8 @@ void Transpose::configure()
 void Transpose::execute() const
 {
   tflite::TransposeParams params{};
-  const int *perm_data = getTensorData<int32_t>(perm());
-  const int size = perm()->shape().dim(0);
+  const int32_t *perm_data = getTensorData<int32_t>(perm());
+  const int32_t size = perm()->shape().dim(0);
   params.perm_count = size;
   for (int i = 0; i < size; i++)
     params.perm[i] = perm_data[i];
diff --git a/compiler/luci-interpreter/src/kernels/Transpose.test.cpp b/compiler/luci-interpreter/src/kernels/Transpose.test.cpp
index f0a915c35..107179910 100644
--- a/compiler/luci-interpreter/src/kernels/Transpose.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Transpose.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/Transpose.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -31,13 +32,16 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
            std::initializer_list<int32_t> output_shape, std::initializer_list<T> input_data,
            std::initializer_list<int32_t> perm_data, std::initializer_list<T> output_data)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   constexpr DataType element_type = getElementType<T>();
-  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
-  Tensor perm_tensor = makeInputTensor<DataType::S32>(perm_shape, perm_data);
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
+  Tensor perm_tensor = makeInputTensor<DataType::S32>(perm_shape, perm_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(element_type);
 
   Transpose kernel(&input_tensor, &perm_tensor, &output_tensor);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.cpp
index 0c70756b2..1b5f9d941 100644
--- a/compiler/luci-interpreter/src/kernels/TransposeConv.cpp
+++ b/compiler/luci-interpreter/src/kernels/TransposeConv.cpp
@@ -19,7 +19,7 @@
 
 #include "kernels/Utils.h"
 
-#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+#include <tensorflow/lite/kernels/internal/reference/transpose_conv.h>
 
 #include <stdexcept>
 
@@ -30,8 +30,10 @@ namespace kernels
 {
 
 TransposeConv::TransposeConv(const Tensor *output_shape, const Tensor *filter, const Tensor *input,
-                             const Tensor *bias, Tensor *output, const TransposeConvParams &params)
-  : KernelWithParams<TransposeConvParams>({output_shape, filter, input, bias}, {output}, params)
+                             const Tensor *bias, Tensor *output, Tensor *scratch_tensor,
+                             const TransposeConvParams &params)
+  : KernelWithParams<TransposeConvParams>({output_shape, filter, input, bias},
+                                          {output, scratch_tensor}, params)
 {
 }
 
@@ -74,15 +76,18 @@ void TransposeConv::configure()
 
   if (input()->element_type() == DataType::U8 || input()->element_type() == DataType::S16)
   {
-    DataType scratch_data_type =
-      input()->element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
-    _scratch_tensor =
-      std::make_unique<Tensor>(scratch_data_type, output()->shape(), AffineQuantization{}, "");
+    auto scratch_tensor = getOutputTensors()[1];
+    scratch_tensor->resize(output()->shape());
     const std::vector<double> real_multipliers =
       getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
 
     _quant_multipliers = quantizeMultipliers(real_multipliers);
   }
+  else
+  {
+    auto scratch_tensor = getOutputTensors()[1];
+    scratch_tensor->set_allocatable(false);
+  }
 }
 
 void TransposeConv::execute() const
@@ -111,8 +116,6 @@ void TransposeConv::execute() const
     default:
       throw std::runtime_error("Unsupported type.");
   }
-  if (!!_scratch_tensor)
-    _scratch_tensor->deallocate();
 }
 
 void TransposeConv::evalFloat() const
@@ -148,13 +151,15 @@ void TransposeConv::evalQuantized() const
   op_params.quantized_activation_min = std::numeric_limits<uint8_t>::min();
   op_params.quantized_activation_max = std::numeric_limits<uint8_t>::max();
 
+  auto scratch_tensor = getOutputTensors()[1];
+
   tflite::reference_ops::TransposeConv(op_params,                                                //
                                        getTensorShape(input()), getTensorData<uint8>(input()),   //
                                        getTensorShape(filter()), getTensorData<uint8>(filter()), //
                                        getTensorShape(bias()), getTensorData<int32_t>(bias()),   //
                                        getTensorShape(output()), getTensorData<uint8>(output()), //
                                        tflite::RuntimeShape(), nullptr,                          //
-                                       getTensorData<int32_t>(_scratch_tensor.get()));
+                                       getTensorData<int32_t>(scratch_tensor));
 }
 
 void TransposeConv::evalQuantizedPerChannel() const
@@ -163,7 +168,9 @@ void TransposeConv::evalQuantizedPerChannel() const
   const auto *filter_data = getTensorData<uint8_t>(filter());
   const auto *bias_data = getTensorData<int32_t>(bias());
   auto *output_data = getTensorData<uint8_t>(output());
-  auto *scratch_data = getTensorData<int32_t>(_scratch_tensor.get());
+
+  auto scratch_tensor = getOutputTensors()[1];
+  auto *scratch_data = getTensorData<int32_t>(scratch_tensor);
 
   const Shape &input_shape = input()->shape();
   const Shape &filter_shape = filter()->shape();
@@ -186,7 +193,7 @@ void TransposeConv::evalQuantizedPerChannel() const
   int32_t activation_max{};
   calculateActivationRangeQuantized(Activation::NONE, output(), &activation_min, &activation_max);
 
-  std::memset(scratch_data, 0, _scratch_tensor->shape().num_elements() * sizeof(int32_t));
+  std::memset(scratch_data, 0, scratch_tensor->shape().num_elements() * sizeof(int32_t));
 
   BroadcastableWrapper<ChannelQuantMultipliers> output_multipliers(_quant_multipliers);
   for (int32_t batch = 0; batch < batches; ++batch)
@@ -255,7 +262,9 @@ void TransposeConv::evalQuantizedS16() const
   const auto *filter_data = getTensorData<int16_t>(filter());
   const auto *bias_data = getTensorData<int64_t>(bias());
   auto *output_data = getTensorData<int16_t>(output());
-  auto *scratch_data = getTensorData<int64_t>(_scratch_tensor.get());
+
+  auto scratch_tensor = getOutputTensors()[1];
+  auto *scratch_data = getTensorData<int64_t>(scratch_tensor);
 
   const Shape &input_shape = input()->shape();
   const Shape &filter_shape = filter()->shape();
@@ -278,7 +287,7 @@ void TransposeConv::evalQuantizedS16() const
   int32_t activation_max{};
   calculateActivationRangeQuantized(Activation::NONE, output(), &activation_min, &activation_max);
 
-  std::memset(scratch_data, 0, _scratch_tensor->shape().num_elements() * sizeof(int64_t));
+  std::memset(scratch_data, 0, scratch_tensor->shape().num_elements() * sizeof(int64_t));
 
   BroadcastableWrapper<ChannelQuantMultipliers> output_multipliers(_quant_multipliers);
   for (int32_t batch = 0; batch < batches; ++batch)
diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.h b/compiler/luci-interpreter/src/kernels/TransposeConv.h
index 2e0beece8..cea0cf3c7 100644
--- a/compiler/luci-interpreter/src/kernels/TransposeConv.h
+++ b/compiler/luci-interpreter/src/kernels/TransposeConv.h
@@ -31,7 +31,8 @@ class TransposeConv : public KernelWithParams<TransposeConvParams>
 {
 public:
   TransposeConv(const Tensor *output_shape, const Tensor *filter, const Tensor *input,
-                const Tensor *bias, Tensor *output, const TransposeConvParams &params);
+                const Tensor *bias, Tensor *output, Tensor *scratch_tensor,
+                const TransposeConvParams &params);
 
   ~TransposeConv();
 
@@ -51,8 +52,6 @@ private:
   void evalQuantizedS16() const;
 
 private:
-  std::unique_ptr<Tensor> _scratch_tensor;
-
   int32_t _padding_height{};
   int32_t _padding_width{};
   // The scaling factor from input to output (aka the 'real multiplier') can
diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
index 9bcb015c1..4856e1b87 100644
--- a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
@@ -16,6 +16,7 @@
 
 #include "kernels/TransposeConv.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -35,11 +36,18 @@ void Check(std::initializer_list<int32_t> output_shape_shape,
            std::initializer_list<T> output_data, luci::Padding padding, int32_t stride_height,
            int32_t stride_width)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   constexpr DataType element_type = getElementType<T>();
   Tensor output_shape_tensor =
-    makeInputTensor<DataType::S32>(output_shape_shape, output_shape_data);
-  Tensor weight_tensor = makeInputTensor<element_type>(weight_shape, weight_data);
-  Tensor input_data_tensor = makeInputTensor<element_type>(input_shape, input_data);
+    makeInputTensor<DataType::S32>(output_shape_shape, output_shape_data, memory_manager.get());
+  Tensor weight_tensor =
+    makeInputTensor<element_type>(weight_shape, weight_data, memory_manager.get());
+  Tensor input_data_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
+
+  DataType scratch_data_type = element_type == DataType::S16 ? DataType::S64 : DataType::S32;
+  Tensor scratch_tensor(scratch_data_type, Shape({}), {}, "");
   Tensor output_tensor = makeOutputTensor(element_type);
 
   TransposeConvParams params{};
@@ -49,17 +57,22 @@ void Check(std::initializer_list<int32_t> output_shape_shape,
 
   if (bias_data.size() != 0)
   {
-    Tensor bias_tensor = makeInputTensor<getElementType<B>()>(bias_shape, bias_data);
+    Tensor bias_tensor =
+      makeInputTensor<getElementType<B>()>(bias_shape, bias_data, memory_manager.get());
     TransposeConv kernel(&output_shape_tensor, &weight_tensor, &input_data_tensor, &bias_tensor,
-                         &output_tensor, params);
+                         &output_tensor, &scratch_tensor, params);
     kernel.configure();
+    memory_manager->allocate_memory(output_tensor);
+    memory_manager->allocate_memory(scratch_tensor);
     kernel.execute();
   }
   else
   {
     TransposeConv kernel(&output_shape_tensor, &weight_tensor, &input_data_tensor, nullptr,
-                         &output_tensor, params);
+                         &output_tensor, &scratch_tensor, params);
     kernel.configure();
+    memory_manager->allocate_memory(output_tensor);
+    memory_manager->allocate_memory(scratch_tensor);
     kernel.execute();
   }
   EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
@@ -114,6 +127,8 @@ TEST(TransposeConvTest, SimpleBiasTest)
 
 TEST(TransposeConvTest, UInt8)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   std::vector<float> input_data{1, 2, 3, 4};
   std::vector<float> filter_data{1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18};
   std::vector<float> bias_data{3, 4};
@@ -131,23 +146,30 @@ TEST(TransposeConvTest, UInt8)
   auto filter_quant = quantizationParams<uint8_t>(-24.0, 39.75); // s = 1 / 4, zp = 96
   auto output_quant = quantizationParams<uint8_t>(-64.0, 191.0); // s = 1, zp = 64
 
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 2, 1}, input_quant.first, input_quant.second, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::U8>({2, 3, 3, 1}, filter_quant.first,
-                                                       filter_quant.second, filter_data);
-  Tensor bias_tensor =
-    makeInputTensor<DataType::S32>({2}, input_quant.first * filter_quant.first, 0, bias_data);
-  Tensor output_shape_tensor = makeInputTensor<DataType::S32>({4}, output_shape_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 2, 1}, input_quant.first, input_quant.second, input_data, memory_manager.get());
+  Tensor filter_tensor = makeInputTensor<DataType::U8>(
+    {2, 3, 3, 1}, filter_quant.first, filter_quant.second, filter_data, memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S32>({2}, input_quant.first * filter_quant.first,
+                                                      0, bias_data, memory_manager.get());
+  Tensor output_shape_tensor =
+    makeInputTensor<DataType::S32>({4}, output_shape_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, output_quant.first, output_quant.second);
 
+  DataType scratch_data_type =
+    input_tensor.element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
+  Tensor scratch_tensor(scratch_data_type, Shape({}), {}, "");
+
   TransposeConvParams params{};
   params.padding = Padding::VALID;
   params.stride_height = 2;
   params.stride_width = 2;
 
   TransposeConv kernel(&output_shape_tensor, &filter_tensor, &input_tensor, &bias_tensor,
-                       &output_tensor, params);
+                       &output_tensor, &scratch_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  memory_manager->allocate_memory(scratch_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape_data));
@@ -156,6 +178,8 @@ TEST(TransposeConvTest, UInt8)
 
 TEST(TransposeConvTest, UInt8_CWQ)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   const int32_t output_channels = 2;
   std::vector<float> input_data{1, 2, 3, 4};
   std::vector<float> filter_data{1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18};
@@ -190,23 +214,30 @@ TEST(TransposeConvTest, UInt8_CWQ)
     bias_scales.push_back(filter_quant_params[i].first * input_quant.first);
   std::vector<int32_t> zerop(output_channels, 0);
 
-  Tensor input_tensor =
-    makeInputTensor<DataType::U8>({1, 2, 2, 1}, input_quant.first, input_quant.second, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::U8>({output_channels, 3, 3, 1}, filter_scales,
-                                                       filter_zerops, 0, filter_data);
-  Tensor bias_tensor =
-    makeInputTensor<DataType::S32>({output_channels}, bias_scales, zerop, 0, bias_data);
-  Tensor output_shape_tensor = makeInputTensor<DataType::S32>({4}, output_shape_data);
+  Tensor input_tensor = makeInputTensor<DataType::U8>(
+    {1, 2, 2, 1}, input_quant.first, input_quant.second, input_data, memory_manager.get());
+  Tensor filter_tensor = makeInputTensor<DataType::U8>(
+    {output_channels, 3, 3, 1}, filter_scales, filter_zerops, 0, filter_data, memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S32>({output_channels}, bias_scales, zerop, 0,
+                                                      bias_data, memory_manager.get());
+  Tensor output_shape_tensor =
+    makeInputTensor<DataType::S32>({4}, output_shape_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::U8, output_quant.first, output_quant.second);
 
+  DataType scratch_data_type =
+    input_tensor.element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
+  Tensor scratch_tensor(scratch_data_type, Shape({}), {}, "");
+
   TransposeConvParams params{};
   params.padding = Padding::VALID;
   params.stride_height = 2;
   params.stride_width = 2;
 
   TransposeConv kernel(&output_shape_tensor, &filter_tensor, &input_tensor, &bias_tensor,
-                       &output_tensor, params);
+                       &output_tensor, &scratch_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  memory_manager->allocate_memory(scratch_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape_data));
@@ -215,6 +246,8 @@ TEST(TransposeConvTest, UInt8_CWQ)
 
 TEST(TransposeConvTest, SInt16)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   std::vector<float> input_data{1, 2, 3, 4};
   std::vector<float> filter_data{1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18};
   std::vector<float> bias_data{3, 4};
@@ -227,20 +260,30 @@ TEST(TransposeConvTest, SInt16)
     42, 46, 48, 52, 106, 114, 63, 68, 71, 76, //
   };
 
-  Tensor input_tensor = makeInputTensor<DataType::S16>({1, 2, 2, 1}, 0.25, 0, input_data);
-  Tensor filter_tensor = makeInputTensor<DataType::S16>({2, 3, 3, 1}, 0.2, 0, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::S64>({2}, 0.25 * 0.2, 0, bias_data);
-  Tensor output_shape_tensor = makeInputTensor<DataType::S32>({4}, output_shape_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>({1, 2, 2, 1}, 0.25, 0, input_data, memory_manager.get());
+  Tensor filter_tensor =
+    makeInputTensor<DataType::S16>({2, 3, 3, 1}, 0.2, 0, filter_data, memory_manager.get());
+  Tensor bias_tensor =
+    makeInputTensor<DataType::S64>({2}, 0.25 * 0.2, 0, bias_data, memory_manager.get());
+  Tensor output_shape_tensor =
+    makeInputTensor<DataType::S32>({4}, output_shape_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S16, 0.5, 0);
 
+  DataType scratch_data_type =
+    input_tensor.element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
+  Tensor scratch_tensor(scratch_data_type, Shape({}), {}, "");
+
   TransposeConvParams params{};
   params.padding = Padding::VALID;
   params.stride_height = 2;
   params.stride_width = 2;
 
   TransposeConv kernel(&output_shape_tensor, &filter_tensor, &input_tensor, &bias_tensor,
-                       &output_tensor, params);
+                       &output_tensor, &scratch_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  memory_manager->allocate_memory(scratch_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape_data));
@@ -249,6 +292,8 @@ TEST(TransposeConvTest, SInt16)
 
 TEST(TransposeConvTest, SInt16_CWQ_weights)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+
   const int output_channels = 2;
   const Shape input_shape{1, 2, 2, 1};
   const Shape filter_shape{output_channels, 3, 3, 1};
@@ -273,21 +318,30 @@ TEST(TransposeConvTest, SInt16_CWQ_weights)
   std::vector<float> bias_scales{filter_scales[0] * input_scale, filter_scales[1] * input_scale};
   const std::vector<int32_t> zerop(2, 0);
 
-  Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, input_scale, 0, input_data);
-  Tensor filter_tensor =
-    makeInputTensor<DataType::S16>(filter_shape, filter_scales, zerop, 0, filter_data);
-  Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, bias_scales, zerop, 0, bias_data);
-  Tensor output_shape_tensor = makeInputTensor<DataType::S32>({4}, output_shape_data);
+  Tensor input_tensor =
+    makeInputTensor<DataType::S16>(input_shape, input_scale, 0, input_data, memory_manager.get());
+  Tensor filter_tensor = makeInputTensor<DataType::S16>(filter_shape, filter_scales, zerop, 0,
+                                                        filter_data, memory_manager.get());
+  Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, bias_scales, zerop, 0, bias_data,
+                                                      memory_manager.get());
+  Tensor output_shape_tensor =
+    makeInputTensor<DataType::S32>({4}, output_shape_data, memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::S16, output_scale, 0);
 
+  DataType scratch_data_type =
+    input_tensor.element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
+  Tensor scratch_tensor(scratch_data_type, Shape({}), {}, "");
+
   TransposeConvParams params{};
   params.padding = Padding::VALID;
   params.stride_height = 2;
   params.stride_width = 2;
 
   TransposeConv kernel(&output_shape_tensor, &filter_tensor, &input_tensor, &bias_tensor,
-                       &output_tensor, params);
+                       &output_tensor, &scratch_tensor, params);
   kernel.configure();
+  memory_manager->allocate_memory(output_tensor);
+  memory_manager->allocate_memory(scratch_tensor);
   kernel.execute();
 
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape_data));
diff --git a/compiler/luci-interpreter/src/kernels/Unpack.test.cpp b/compiler/luci-interpreter/src/kernels/Unpack.test.cpp
index 6d611e12e..4f22c9f30 100644
--- a/compiler/luci-interpreter/src/kernels/Unpack.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Unpack.test.cpp
@@ -17,6 +17,7 @@
 
 #include "kernels/Unpack.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -32,10 +33,12 @@ void Check(int axis, Shape input_shape, std::initializer_list<T> input_data,
            const std::vector<std::initializer_list<int32_t>> &exp_output_shape,
            std::vector<std::initializer_list<T>> exp_output_data)
 {
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
   constexpr DataType element_type = getElementType<T>();
   const int num_outputs = input_shape.dim(axis < 0 ? axis + input_shape.num_dims() : axis);
 
-  Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
+  Tensor input_tensor =
+    makeInputTensor<element_type>(input_shape, input_data, memory_manager.get());
   std::vector<Tensor> output_tensors;
   output_tensors.reserve(num_outputs);
   for (int i = 0; i < num_outputs; ++i)
@@ -54,6 +57,10 @@ void Check(int axis, Shape input_shape, std::initializer_list<T> input_data,
 
   Unpack kernel(&input_tensor, std::move(output_tensor_ptrs), params);
   kernel.configure();
+  for (int i = 0; i < num_outputs; i++)
+  {
+    memory_manager->allocate_memory(output_tensors[i]);
+  }
   kernel.execute();
 
   for (int i = 0; i < num_outputs; ++i)
diff --git a/compiler/luci-interpreter/src/kernels/Utils.cpp b/compiler/luci-interpreter/src/kernels/Utils.cpp
index 83faa7d7f..6e83e37f6 100644
--- a/compiler/luci-interpreter/src/kernels/Utils.cpp
+++ b/compiler/luci-interpreter/src/kernels/Utils.cpp
@@ -91,7 +91,7 @@ static void calculateActivationRangeQuantizedImpl(Activation activation, int32_t
 void calculateActivationRangeQuantized(Activation activation, const Tensor *output,
                                        int32_t *activation_min, int32_t *activation_max)
 {
-  // For now, assume that signed type implies signed symmetric quantization.
+  assert(output->zero_points().size() == 1);
   int32_t qmin{};
   int32_t qmax{};
   switch (output->element_type())
@@ -101,11 +101,11 @@ void calculateActivationRangeQuantized(Activation activation, const Tensor *outp
       qmax = std::numeric_limits<uint8_t>::max();
       break;
     case DataType::S8:
-      assert(output->zero_point() == 0);
       qmin = -std::numeric_limits<int8_t>::max();
       qmax = std::numeric_limits<int8_t>::max();
       break;
     case DataType::S16:
+      // For now, assume that signed int16 type implies signed symmetric quantization.
       assert(output->zero_point() == 0);
       qmin = -std::numeric_limits<int16_t>::max();
       qmax = std::numeric_limits<int16_t>::max();
diff --git a/compiler/luci-interpreter/src/kernels/While.cpp b/compiler/luci-interpreter/src/kernels/While.cpp
index d4676467d..153bd1a99 100644
--- a/compiler/luci-interpreter/src/kernels/While.cpp
+++ b/compiler/luci-interpreter/src/kernels/While.cpp
@@ -49,6 +49,13 @@ void copy(const std::vector<Tensor *> &src, const std::vector<Tensor *> &dst)
   copy(const_src, dst);
 }
 
+// TODO: Think about how allocate memory for output in main graph
+void configureTensorsAllocations(const std::vector<Tensor *> &tensors, RuntimeGraph *run_graph)
+{
+  for (auto tensor : tensors)
+    run_graph->configureAllocations(tensor);
+}
+
 } // namespace
 
 While::While(std::vector<const Tensor *> inputs, std::vector<Tensor *> outputs,
@@ -78,11 +85,15 @@ void While::execute() const
   const auto &cond_inputs = _cond_graph->getInputTensors();
   const auto &cond_outputs = _cond_graph->getOutputTensors();
 
+  configureTensorsAllocations(cond_inputs, _cond_graph);
+
   copy(getInputTensors(), cond_inputs);
 
   const auto &body_inputs = _body_graph->getInputTensors();
   const auto &body_outputs = _body_graph->getOutputTensors();
 
+  configureTensorsAllocations(body_inputs, _body_graph);
+
   while (true)
   {
     _cond_graph->execute();
diff --git a/compiler/luci-interpreter/src/kernels/While.test.cpp b/compiler/luci-interpreter/src/kernels/While.test.cpp
index a066d2c12..cb8f89130 100644
--- a/compiler/luci-interpreter/src/kernels/While.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/While.test.cpp
@@ -20,6 +20,7 @@
 #include "kernels/Less.h"
 #include "kernels/While.h"
 #include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
 
 namespace luci_interpreter
 {
@@ -30,14 +31,18 @@ namespace
 
 using namespace testing;
 
-RuntimeGraph *buildCondSubgraph(RuntimeModule *module, DataType dtype, Tensor *input_cond)
+RuntimeGraph *buildCondSubgraph(RuntimeModule *module, DataType dtype, Tensor *input_cond,
+                                IMemoryManager *memory_manager)
 {
-  RuntimeGraph *graph = module->addGraph();
+  RuntimeGraph *graph = module->addGraph(memory_manager);
   Tensor *input =
     graph->addTensor(std::make_unique<Tensor>(dtype, Shape{}, AffineQuantization{}, ""));
   Tensor *output =
     graph->addTensor(std::make_unique<Tensor>(DataType::BOOL, Shape{}, AffineQuantization{}, ""));
 
+  memory_manager->allocate_memory(*input);
+  memory_manager->allocate_memory(*output);
+
   graph->setInputTensors({input});
   graph->setOutputTensors({output});
 
@@ -46,14 +51,18 @@ RuntimeGraph *buildCondSubgraph(RuntimeModule *module, DataType dtype, Tensor *i
   return graph;
 }
 
-RuntimeGraph *buildBodySubgraph(RuntimeModule *module, DataType dtype, Tensor *input_add)
+RuntimeGraph *buildBodySubgraph(RuntimeModule *module, DataType dtype, Tensor *input_add,
+                                IMemoryManager *memory_manager)
 {
-  RuntimeGraph *graph = module->addGraph();
+  RuntimeGraph *graph = module->addGraph(memory_manager);
   Tensor *input =
     graph->addTensor(std::make_unique<Tensor>(dtype, Shape{}, AffineQuantization{}, ""));
   Tensor *output =
     graph->addTensor(std::make_unique<Tensor>(dtype, Shape{}, AffineQuantization{}, ""));
 
+  memory_manager->allocate_memory(*input);
+  memory_manager->allocate_memory(*output);
+
   graph->setInputTensors({input});
   graph->setOutputTensors({output});
 
@@ -66,18 +75,22 @@ RuntimeGraph *buildBodySubgraph(RuntimeModule *module, DataType dtype, Tensor *i
 
 TEST(WhileTest, FloatLoop10)
 {
-  Tensor input = makeInputTensor<DataType::FLOAT32>({1}, {1});
+  std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();
+  Tensor input = makeInputTensor<DataType::FLOAT32>({1}, {1}, memory_manager.get());
   Tensor output = makeOutputTensor(DataType::FLOAT32);
 
-  Tensor input_cond = makeInputTensor<DataType::FLOAT32>({1}, {10});
-  Tensor input_add = makeInputTensor<DataType::FLOAT32>({1}, {1});
+  Tensor input_cond = makeInputTensor<DataType::FLOAT32>({1}, {10}, memory_manager.get());
+  Tensor input_add = makeInputTensor<DataType::FLOAT32>({1}, {1}, memory_manager.get());
 
   RuntimeModule module(nullptr);
-  RuntimeGraph *cond_graph = buildCondSubgraph(&module, DataType::FLOAT32, &input_cond);
-  RuntimeGraph *body_graph = buildBodySubgraph(&module, DataType::FLOAT32, &input_add);
+  RuntimeGraph *cond_graph =
+    buildCondSubgraph(&module, DataType::FLOAT32, &input_cond, memory_manager.get());
+  RuntimeGraph *body_graph =
+    buildBodySubgraph(&module, DataType::FLOAT32, &input_add, memory_manager.get());
 
   While kernel({&input}, {&output}, cond_graph, body_graph);
   kernel.configure();
+  memory_manager->allocate_memory(output);
   kernel.execute();
 
   EXPECT_THAT(extractTensorData<float>(output), FloatArrayNear({10}));
diff --git a/compiler/luci-interpreter/src/loader/CMakeLists.txt b/compiler/luci-interpreter/src/loader/CMakeLists.txt
index 782f46761..974283a2f 100644
--- a/compiler/luci-interpreter/src/loader/CMakeLists.txt
+++ b/compiler/luci-interpreter/src/loader/CMakeLists.txt
@@ -7,14 +7,23 @@ set(SOURCES
     KernelBuilder.cpp
     ModuleLoader.h
     ModuleLoader.cpp
-    RuntimeToIR.h)
+    RuntimeToIR.h
+    nodes/Builders.h)
 
-add_library(luci_interpreter_loader STATIC ${SOURCES})
-set_target_properties(luci_interpreter_loader PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_include_directories(luci_interpreter_loader PUBLIC "${LUCI_INTERPRETER_SOURCE_DIR}")
-target_link_libraries(luci_interpreter_loader
-    PUBLIC luci_lang luci_interpreter_core
-    PRIVATE luci_interpreter_kernels nncc_common)
+# include kernel specific builders
+macro(REGISTER_KERNEL NODE)
+  list(APPEND SOURCES "nodes/${NODE}.cpp")
+endmacro(REGISTER_KERNEL)
+include(${KERNEL_REGISTER_FILE})
+
+add_library(${LUCI_INTERPRETER_LOADER} STATIC ${SOURCES})
+set_target_properties(${LUCI_INTERPRETER_LOADER} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_include_directories(${LUCI_INTERPRETER_LOADER} PUBLIC "${LUCI_INTERPRETER_PAL_DIR}")
+target_include_directories(${LUCI_INTERPRETER_LOADER} PUBLIC "${LUCI_INTERPRETER_SOURCE_DIR}")
+
+target_link_libraries(${LUCI_INTERPRETER_LOADER}
+        PUBLIC luci_lang ${LUCI_INTERPRETER_CORE}
+        PRIVATE ${LUCI_INTERPRETER_KERNELS} nncc_common)
 
 if(NOT ENABLE_TEST)
   return()
@@ -24,5 +33,5 @@ nnas_find_package(GTest REQUIRED)
 
 set(TEST_SOURCES KernelBuilder.test.cpp)
 
-GTest_AddTest(luci_interpreter_loader_test ${TEST_SOURCES})
-target_link_libraries(luci_interpreter_loader_test luci_interpreter_loader)
+GTest_AddTest(${LUCI_INTERPRETER_LOADER}_test ${TEST_SOURCES})
+target_link_libraries(${LUCI_INTERPRETER_LOADER}_test ${LUCI_INTERPRETER_LOADER})
diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
index ee45ad747..b55e7c504 100644
--- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp
+++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
@@ -57,6 +57,8 @@ const void *getNodeData(const luci::CircleConst *node, size_t *data_size)
       return getNodeDataImpl<DataType::U8>(node, data_size);
     case DataType::FLOAT32:
       return getNodeDataImpl<DataType::FLOAT32>(node, data_size);
+    case DataType::S8:
+      return getNodeDataImpl<DataType::S8>(node, data_size);
     case DataType::S16:
       return getNodeDataImpl<DataType::S16>(node, data_size);
     case DataType::S32:
@@ -82,6 +84,7 @@ bool isExecutableNode(const luci::CircleNode *node)
     // The following nodes denote outputs of multiple-output nodes.
     case luci::CircleOpcode::CIRCLEIFOUT:
     case luci::CircleOpcode::CIRCLESPLITOUT:
+    case luci::CircleOpcode::CIRCLESPLITVOUT:
     case luci::CircleOpcode::CIRCLEUNPACKOUT:
     case luci::CircleOpcode::CIRCLEWHILEOUT:
       return false;
@@ -112,9 +115,10 @@ bool isTensorProducingNode(const luci::CircleNode *node)
 GraphLoader::GraphLoader(
   const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
   const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
-  std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
+  std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor, IMemoryManager *memory_manager)
   : _graph(graph), _runtime_graph(runtime_graph), _runtime_to_ir(runtime_to_ir),
-    _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor)
+    _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor),
+    _memory_manager(memory_manager)
 {
 }
 
@@ -156,7 +160,10 @@ void GraphLoader::loadTensors()
       size_t data_size{};
       const void *const_data = getNodeData(const_node, &data_size);
       if (const_data != nullptr)
+      {
+        _memory_manager->allocate_memory(*tensor);
         tensor->writeData(const_data, data_size);
+      }
     }
 
     _node_to_tensor.emplace(node, tensor.get());
@@ -173,6 +180,7 @@ void GraphLoader::initInputOutputTensors() const
   for (size_t i = 0; i < input_nodes.size(); ++i)
   {
     input_tensors[i] = _node_to_tensor.at(input_nodes[i]);
+    _memory_manager->allocate_memory(*input_tensors[i]);
   }
   _runtime_graph->setInputTensors(input_tensors);
 
diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.h b/compiler/luci-interpreter/src/loader/GraphLoader.h
index 89c5bcad7..fe066ecf8 100644
--- a/compiler/luci-interpreter/src/loader/GraphLoader.h
+++ b/compiler/luci-interpreter/src/loader/GraphLoader.h
@@ -19,6 +19,7 @@
 
 #include "core/RuntimeGraph.h"
 #include "loader/RuntimeToIR.h"
+#include "luci_interpreter/MemoryManager.h"
 
 #include <loco/IR/Graph.h>
 
@@ -32,7 +33,8 @@ class GraphLoader
 public:
   GraphLoader(const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
               const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
-              std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor);
+              std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor,
+              IMemoryManager *memory_manager);
 
   void loadTensors();
   void initInputOutputTensors() const;
@@ -42,6 +44,7 @@ private:
   const loco::Graph *_graph;
   RuntimeGraph *_runtime_graph;
   RuntimeToIR &_runtime_to_ir;
+  IMemoryManager *_memory_manager;
 
   const std::unordered_map<const loco::Graph *, RuntimeGraph *> &_graph_to_runtime_graph;
   std::unordered_map<const loco::Node *, Tensor *> &_node_to_tensor;
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
index 4cb8bd691..8483a9a3d 100644
--- a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
@@ -15,1240 +15,90 @@
  */
 
 #include "loader/KernelBuilder.h"
-
-#include "kernels/Add.h"
-#include "kernels/ArgMax.h"
-#include "kernels/AveragePool2D.h"
-#include "kernels/BatchToSpaceND.h"
-#include "kernels/Cast.h"
-#include "kernels/Concatenation.h"
-#include "kernels/Conv2D.h"
-#include "kernels/DepthToSpace.h"
-#include "kernels/DepthwiseConv2D.h"
-#include "kernels/Div.h"
-#include "kernels/Elu.h"
-#include "kernels/Exp.h"
-#include "kernels/Floor.h"
-#include "kernels/FloorDiv.h"
-#include "kernels/Equal.h"
-#include "kernels/FullyConnected.h"
-#include "kernels/Greater.h"
-#include "kernels/GreaterEqual.h"
-#include "kernels/If.h"
-#include "kernels/InstanceNorm.h"
-#include "kernels/L2Normalize.h"
-#include "kernels/L2Pool2D.h"
-#include "kernels/LeakyRelu.h"
-#include "kernels/Less.h"
-#include "kernels/LessEqual.h"
-#include "kernels/LocalResponseNormalization.h"
-#include "kernels/LogicalAnd.h"
-#include "kernels/LogicalNot.h"
-#include "kernels/LogicalOr.h"
-#include "kernels/Logistic.h"
-#include "kernels/LogSoftmax.h"
-#include "kernels/Maximum.h"
-#include "kernels/MaxPool2D.h"
-#include "kernels/Mean.h"
-#include "kernels/Minimum.h"
-#include "kernels/MirrorPad.h"
-#include "kernels/Mul.h"
-#include "kernels/Neg.h"
-#include "kernels/NotEqual.h"
-#include "kernels/Pack.h"
-#include "kernels/Pad.h"
-#include "kernels/PadV2.h"
-#include "kernels/Pow.h"
-#include "kernels/PRelu.h"
-#include "kernels/Relu.h"
-#include "kernels/Relu6.h"
-#include "kernels/Reshape.h"
-#include "kernels/ResizeBilinear.h"
-#include "kernels/ResizeNearestNeighbor.h"
-#include "kernels/ReverseV2.h"
-#include "kernels/Rsqrt.h"
-#include "kernels/Slice.h"
-#include "kernels/Softmax.h"
-#include "kernels/SpaceToBatchND.h"
-#include "kernels/SpaceToDepth.h"
-#include "kernels/Split.h"
-#include "kernels/StridedSlice.h"
-#include "kernels/Sqrt.h"
-#include "kernels/Square.h"
-#include "kernels/SquaredDifference.h"
-#include "kernels/Squeeze.h"
-#include "kernels/Sub.h"
-#include "kernels/Tanh.h"
-#include "kernels/Unpack.h"
-#include "kernels/Transpose.h"
-#include "kernels/TransposeConv.h"
-#include "kernels/While.h"
+#include "loader/nodes/Builders.h"
 
 #include <stdexcept>
 
-namespace
-{
-
-template <typename CircleNodeOut>
-std::vector<const loco::Node *> collectOutputNodes(const luci::CircleNode *node)
-{
-  std::vector<const CircleNodeOut *> output_nodes;
-  for (const loco::Node *loco_node : loco::succs(node))
-  {
-    output_nodes.push_back(loco::must_cast<const CircleNodeOut *>(loco_node));
-  }
-  std::sort(output_nodes.begin(), output_nodes.end(),
-            [](const CircleNodeOut *node1, const CircleNodeOut *node2) {
-              return node1->index() < node2->index();
-            });
-  return {output_nodes.cbegin(), output_nodes.cend()};
-}
-
-} // namespace
-
 namespace luci_interpreter
 {
 
-// TODO move to anonymous namespace
-enum class KB
+#define CIRCLE_NODE(OPCODE, CLASS) CLASS,
+#define CIRCLE_VNODE(OPCODE, CLASS) CLASS,
+
+// This enum is auxiliary.
+// It is duplicate of luci::CircleOpcode but initialized with CLASS instead of OPCODE,
+// because list of target operators is in format of CLASS names
+enum class BuilderId
 {
-  ABC,
-  DEF,
-  GHIJ,
-  KLMN,
-  OPQR,
-  STUV,
-  WXYZ,
+#include <luci/IR/CircleNodes.lst>
+  Size // casts to count of values in BuilderId enum
 };
 
-#define DECLARE_VISIT(CLASS) std::unique_ptr<Kernel> visit(const luci::CLASS *) override
+#undef CIRCLE_VNODE
+#undef CIRCLE_NODE
 
-template <KB kb> class KernelBuilderLet;
+/**
+ * @brief Registry of kernel builders
+ *
+ * This class contains mapping from Opcodes to kernel builder functions
+ */
 
-template <>
-class KernelBuilderLet<KB::ABC> : public luci::CircleNodeVisitor<std::unique_ptr<Kernel>>,
-                                  public KernelBuilderHelper
+class KernelBuilderRegistry
 {
 public:
-  KernelBuilderLet(
-    const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
-    const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
-    : KernelBuilderHelper(graph_to_runtime_graph, node_to_tensor)
-  {
-  }
+  using KernelBuilderFunc = std::unique_ptr<Kernel>(const luci::CircleNode *,
+                                                    KernelBuilderHelper &);
 
-public:
-  std::unique_ptr<Kernel> visit(const luci::CircleNode *) { return nullptr; }
-
-public:
-  DECLARE_VISIT(CircleAdd);
-  DECLARE_VISIT(CircleArgMax);
-  DECLARE_VISIT(CircleAveragePool2D);
-  DECLARE_VISIT(CircleBatchToSpaceND);
-  DECLARE_VISIT(CircleCast);
-  DECLARE_VISIT(CircleConcatenation);
-  DECLARE_VISIT(CircleConst);
-  DECLARE_VISIT(CircleConv2D);
-};
-
-template <>
-class KernelBuilderLet<KB::DEF> : public luci::CircleNodeVisitor<std::unique_ptr<Kernel>>,
-                                  public KernelBuilderHelper
-{
-public:
-  KernelBuilderLet(
-    const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
-    const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
-    : KernelBuilderHelper(graph_to_runtime_graph, node_to_tensor)
+  KernelBuilderRegistry() : _operator_builders(size_t(BuilderId::Size), nullptr)
   {
-  }
+#define REGISTER_KERNEL(name) \
+  register_kernel_builder(BuilderId::Circle##name, build_kernel_Circle##name);
 
-public:
-  std::unique_ptr<Kernel> visit(const luci::CircleNode *) { return nullptr; }
-
-public:
-  DECLARE_VISIT(CircleDepthToSpace);
-  DECLARE_VISIT(CircleDepthwiseConv2D);
-  DECLARE_VISIT(CircleDiv);
-  DECLARE_VISIT(CircleElu);
-  DECLARE_VISIT(CircleEqual);
-  DECLARE_VISIT(CircleExp);
-  DECLARE_VISIT(CircleFloor);
-  DECLARE_VISIT(CircleFloorDiv);
-  DECLARE_VISIT(CircleFullyConnected);
-};
+#include "KernelsToBuild.lst"
 
-template <>
-class KernelBuilderLet<KB::GHIJ> : public luci::CircleNodeVisitor<std::unique_ptr<Kernel>>,
-                                   public KernelBuilderHelper
-{
-public:
-  KernelBuilderLet(
-    const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
-    const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
-    : KernelBuilderHelper(graph_to_runtime_graph, node_to_tensor)
-  {
+#undef REGISTER_KERNEL
   }
 
-public:
-  std::unique_ptr<Kernel> visit(const luci::CircleNode *) { return nullptr; }
-
-public:
-  DECLARE_VISIT(CircleGreater);
-  DECLARE_VISIT(CircleGreaterEqual);
-  DECLARE_VISIT(CircleIf);
-  DECLARE_VISIT(CircleInput);
-  DECLARE_VISIT(CircleInstanceNorm);
-};
-
-template <>
-class KernelBuilderLet<KB::KLMN> : public luci::CircleNodeVisitor<std::unique_ptr<Kernel>>,
-                                   public KernelBuilderHelper
-{
-public:
-  KernelBuilderLet(
-    const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
-    const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
-    : KernelBuilderHelper(graph_to_runtime_graph, node_to_tensor)
+  KernelBuilderFunc *get_kernel_builder_func(luci::CircleOpcode opcode) const
   {
+    return _operator_builders.at(size_t(opcode));
   }
 
-public:
-  std::unique_ptr<Kernel> visit(const luci::CircleNode *) { return nullptr; }
-
-public:
-  DECLARE_VISIT(CircleL2Normalize);
-  DECLARE_VISIT(CircleL2Pool2D);
-  DECLARE_VISIT(CircleLeakyRelu);
-  DECLARE_VISIT(CircleLess);
-  DECLARE_VISIT(CircleLessEqual);
-  DECLARE_VISIT(CircleLocalResponseNormalization);
-  DECLARE_VISIT(CircleLogSoftmax);
-  DECLARE_VISIT(CircleLogicalAnd);
-  DECLARE_VISIT(CircleLogicalNot);
-  DECLARE_VISIT(CircleLogicalOr);
-  DECLARE_VISIT(CircleLogistic);
-  DECLARE_VISIT(CircleMaxPool2D);
-  DECLARE_VISIT(CircleMaximum);
-  DECLARE_VISIT(CircleMean);
-  DECLARE_VISIT(CircleMinimum);
-  DECLARE_VISIT(CircleMirrorPad);
-  DECLARE_VISIT(CircleMul);
-  DECLARE_VISIT(CircleNeg);
-  DECLARE_VISIT(CircleNotEqual);
-};
+private:
+  std::vector<KernelBuilderFunc *> _operator_builders;
 
-template <>
-class KernelBuilderLet<KB::OPQR> : public luci::CircleNodeVisitor<std::unique_ptr<Kernel>>,
-                                   public KernelBuilderHelper
-{
-public:
-  KernelBuilderLet(
-    const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
-    const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
-    : KernelBuilderHelper(graph_to_runtime_graph, node_to_tensor)
+  void register_kernel_builder(BuilderId id, KernelBuilderFunc *func)
   {
+    // Using BuilderId is a duplicate of luci::CirclreOpcode,
+    // size_t(id) is equal to size_t(corresponding operation opcode).
+    assert(size_t(id) < _operator_builders.size());
+    _operator_builders[size_t(id)] = func;
   }
-
-public:
-  std::unique_ptr<Kernel> visit(const luci::CircleNode *) { return nullptr; }
-
-public:
-  DECLARE_VISIT(CircleOutput);
-  DECLARE_VISIT(CirclePRelu);
-  DECLARE_VISIT(CirclePack);
-  DECLARE_VISIT(CirclePad);
-  DECLARE_VISIT(CirclePadV2);
-  DECLARE_VISIT(CirclePow);
-  DECLARE_VISIT(CircleRelu);
-  DECLARE_VISIT(CircleRelu6);
-  DECLARE_VISIT(CircleReshape);
-  DECLARE_VISIT(CircleResizeBilinear);
-  DECLARE_VISIT(CircleResizeNearestNeighbor);
-  DECLARE_VISIT(CircleReverseV2);
-  DECLARE_VISIT(CircleRsqrt);
 };
 
-template <>
-class KernelBuilderLet<KB::STUV> : public luci::CircleNodeVisitor<std::unique_ptr<Kernel>>,
-                                   public KernelBuilderHelper
+KernelBuilder::KernelBuilder(
+  const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
+  const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
+  : KernelBuilderHelper(graph_to_runtime_graph, node_to_tensor)
 {
-public:
-  KernelBuilderLet(
-    const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
-    const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
-    : KernelBuilderHelper(graph_to_runtime_graph, node_to_tensor)
-  {
-  }
-
-public:
-  std::unique_ptr<Kernel> visit(const luci::CircleNode *) { return nullptr; }
-
-public:
-  DECLARE_VISIT(CircleSlice);
-  DECLARE_VISIT(CircleSoftmax);
-  DECLARE_VISIT(CircleSpaceToBatchND);
-  DECLARE_VISIT(CircleSpaceToDepth);
-  DECLARE_VISIT(CircleSplit);
-  DECLARE_VISIT(CircleSqrt);
-  DECLARE_VISIT(CircleSquare);
-  DECLARE_VISIT(CircleSquaredDifference);
-  DECLARE_VISIT(CircleSqueeze);
-  DECLARE_VISIT(CircleStridedSlice);
-  DECLARE_VISIT(CircleSub);
-  DECLARE_VISIT(CircleTanh);
-  DECLARE_VISIT(CircleTranspose);
-  DECLARE_VISIT(CircleTransposeConv);
-  DECLARE_VISIT(CircleUnpack);
-};
+  _builder_registry = std::make_unique<KernelBuilderRegistry>();
+}
 
-template <>
-class KernelBuilderLet<KB::WXYZ> : public luci::CircleNodeVisitor<std::unique_ptr<Kernel>>,
-                                   public KernelBuilderHelper
+KernelBuilder::~KernelBuilder()
 {
-public:
-  KernelBuilderLet(
-    const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
-    const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
-    : KernelBuilderHelper(graph_to_runtime_graph, node_to_tensor)
-  {
-  }
-
-public:
-  std::unique_ptr<Kernel> visit(const luci::CircleNode *) { return nullptr; }
-
-public:
-  DECLARE_VISIT(CircleWhile);
-};
-
-#undef DECLARE_VISIT
+  // Need to define in this CPP to hide KernelBuilderRegistry internals.
+  // This destructor deletes _builder_registry
+}
 
 std::unique_ptr<Kernel> KernelBuilder::build(const luci::CircleNode *node)
 {
-#define VISIT_KB(GRP)                                                          \
-  do                                                                           \
-  {                                                                            \
-    KernelBuilderLet<KB::GRP> kbl(graph_to_runtime_graph(), node_to_tensor()); \
-    auto ret = node->accept(&kbl);                                             \
-    if (ret != nullptr)                                                        \
-      return ret;                                                              \
-  } while (false)
-
-  VISIT_KB(ABC);
-  VISIT_KB(DEF);
-  VISIT_KB(GHIJ);
-  VISIT_KB(KLMN);
-  VISIT_KB(OPQR);
-  VISIT_KB(STUV);
-  VISIT_KB(WXYZ);
+  auto specific_builder = _builder_registry->get_kernel_builder_func(node->opcode());
+  if (specific_builder != nullptr)
+    return specific_builder(node, *this);
 
-#undef VISIT_KB
   std::string msg = "Unsupported operator: ";
   msg += std::to_string(static_cast<uint32_t>(node->opcode())) + " " + std::string(node->name());
   throw std::invalid_argument(msg.c_str());
 }
 
-std::unique_ptr<Kernel> KernelBuilderLet<KB::ABC>::visit(const luci::CircleAdd *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input1 = getInputTensor(node->x());
-  const Tensor *input2 = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  AddParams params{};
-  params.activation = node->fusedActivationFunction();
-
-  return std::make_unique<kernels::Add>(input1, input2, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::ABC>::visit(const luci::CircleArgMax *node)
-{
-  assert(node->arity() == 2);
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *axis = getInputTensor(node->dimension());
-  Tensor *output = getOutputTensor(node);
-
-  ArgMaxParams params{};
-  params.output_type = node->output_type();
-
-  return std::make_unique<kernels::ArgMax>(input, axis, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::ABC>::visit(const luci::CircleAveragePool2D *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->value());
-  Tensor *output = getOutputTensor(node);
-
-  Pool2DParams params{};
-  params.padding = node->padding();
-  params.filter_height = node->filter()->h();
-  params.filter_width = node->filter()->w();
-  params.stride_height = node->stride()->h();
-  params.stride_width = node->stride()->w();
-  params.activation = node->fusedActivationFunction();
-
-  return std::make_unique<kernels::AveragePool2D>(input, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::ABC>::visit(const luci::CircleBatchToSpaceND *node)
-{
-  assert(node->arity() == 3);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *block_shape = getInputTensor(node->block_shape());
-  const Tensor *crops = getInputTensor(node->crops());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::BatchToSpaceND>(input, block_shape, crops, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::ABC>::visit(const luci::CircleCast *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->x());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Cast>(input, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::ABC>::visit(const luci::CircleConcatenation *node)
-{
-  std::vector<const Tensor *> inputs(node->numValues());
-  for (uint32_t i = 0; i < node->numValues(); ++i)
-  {
-    inputs[i] = getInputTensor(node->values(i));
-  }
-  Tensor *output = getOutputTensor(node);
-
-  ConcatenationParams params{};
-  params.axis = node->axis();
-  params.activation = node->fusedActivationFunction();
-
-  return std::make_unique<kernels::Concatenation>(std::move(inputs), output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::ABC>::visit(const luci::CircleConst *)
-{
-  throw std::runtime_error("Const node cannot be executed.");
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::ABC>::visit(const luci::CircleConv2D *node)
-{
-  assert(node->arity() == 3);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *filter = getInputTensor(node->filter());
-  const Tensor *bias = getInputTensor(node->bias());
-  Tensor *output = getOutputTensor(node);
-
-  Conv2DParams params{};
-  params.padding = node->padding();
-  params.stride_height = node->stride()->h();
-  params.stride_width = node->stride()->w();
-  params.dilation_height_factor = node->dilation()->h();
-  params.dilation_width_factor = node->dilation()->w();
-  params.activation = node->fusedActivationFunction();
-
-  return std::make_unique<kernels::Conv2D>(input, filter, bias, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::DEF>::visit(const luci::CircleDepthToSpace *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->input());
-  Tensor *output = getOutputTensor(node);
-
-  DepthToSpaceParams params{};
-  params.block_size = node->block_size();
-
-  return std::make_unique<kernels::DepthToSpace>(input, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::DEF>::visit(const luci::CircleDepthwiseConv2D *node)
-{
-  assert(node->arity() == 3);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *filter = getInputTensor(node->filter());
-  const Tensor *bias = getInputTensor(node->bias());
-  Tensor *output = getOutputTensor(node);
-
-  DepthwiseConv2DParams params{};
-  params.padding = node->padding();
-  params.depth_multiplier = node->depthMultiplier();
-  params.stride_height = node->stride()->h();
-  params.stride_width = node->stride()->w();
-  params.dilation_height_factor = node->dilation()->h();
-  params.dilation_width_factor = node->dilation()->w();
-  params.activation = node->fusedActivationFunction();
-
-  return std::make_unique<kernels::DepthwiseConv2D>(input, filter, bias, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::DEF>::visit(const luci::CircleDiv *node)
-{
-  assert(node->arity() == 2);
-  const Tensor *input1 = getInputTensor(node->x());
-  const Tensor *input2 = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  DivParams params{};
-  params.activation = node->fusedActivationFunction();
-
-  return std::make_unique<kernels::Div>(input1, input2, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::DEF>::visit(const luci::CircleElu *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->features());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Elu>(input, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::DEF>::visit(const luci::CircleEqual *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *x = getInputTensor(node->x());
-  const Tensor *y = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Equal>(x, y, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::DEF>::visit(const luci::CircleExp *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->x());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Exp>(input, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::DEF>::visit(const luci::CircleFloor *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->x());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Floor>(input, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::DEF>::visit(const luci::CircleFloorDiv *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *x = getInputTensor(node->x());
-  const Tensor *y = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::FloorDiv>(x, y, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::DEF>::visit(const luci::CircleFullyConnected *node)
-{
-  assert(node->arity() == 3);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *weights = getInputTensor(node->weights());
-  const Tensor *bias = getOptionalInputTensor(node->bias());
-  Tensor *output = getOutputTensor(node);
-
-  FullyConnectedParams params{};
-  params.activation = node->fusedActivationFunction();
-
-  return std::make_unique<kernels::FullyConnected>(input, weights, bias, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::GHIJ>::visit(const luci::CircleGreater *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *x = getInputTensor(node->x());
-  const Tensor *y = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Greater>(x, y, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::GHIJ>::visit(const luci::CircleGreaterEqual *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *x = getInputTensor(node->x());
-  const Tensor *y = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::GreaterEqual>(x, y, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::GHIJ>::visit(const luci::CircleIf *node)
-{
-  auto output_nodes = collectOutputNodes<luci::CircleIfOut>(node);
-  assert(node->arity() == 1 + node->input_count());
-  assert(output_nodes.size() == static_cast<size_t>(node->output_count()));
-
-  const Tensor *cond = getInputTensor(node->cond());
-  std::vector<const Tensor *> inputs(node->input_count());
-  for (uint32_t i = 0; i < node->input_count(); ++i)
-  {
-    inputs[i] = getInputTensor(node->input(i));
-  }
-  std::vector<Tensor *> outputs = getOutputTensors(output_nodes);
-
-  RuntimeGraph *then_graph = getRuntimeGraph(node->then_graph());
-  RuntimeGraph *else_graph = getRuntimeGraph(node->else_graph());
-
-  return std::make_unique<kernels::If>(cond, std::move(inputs), std::move(outputs), then_graph,
-                                       else_graph);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::GHIJ>::visit(const luci::CircleInstanceNorm *node)
-{
-  assert(node->arity() == 3);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *gamma = getInputTensor(node->gamma());
-  const Tensor *beta = getInputTensor(node->beta());
-
-  Tensor *output = getOutputTensor(node);
-
-  InstanceNormParams params{};
-  params.epsilon = node->epsilon();
-  params.activation = node->fusedActivationFunction();
-
-  return std::make_unique<kernels::InstanceNorm>(input, gamma, beta, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::GHIJ>::visit(const luci::CircleInput *)
-{
-  throw std::runtime_error("Input node cannot be executed.");
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleL2Normalize *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->x());
-  Tensor *output = getOutputTensor(node);
-
-  L2NormParams params{};
-  params.activation = node->fusedActivationFunction();
-
-  return std::make_unique<kernels::L2Normalize>(input, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleL2Pool2D *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->value());
-  Tensor *output = getOutputTensor(node);
-
-  Pool2DParams params{};
-  params.padding = node->padding();
-  params.filter_height = node->filter()->h();
-  params.filter_width = node->filter()->w();
-  params.stride_height = node->stride()->h();
-  params.stride_width = node->stride()->w();
-  params.activation = node->fusedActivationFunction();
-
-  return std::make_unique<kernels::L2Pool2D>(input, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleLeakyRelu *node)
-{
-  assert(node->arity() == 1);
-  const Tensor *input = getInputTensor(node->features());
-  Tensor *output = getOutputTensor(node);
-
-  LeakyReluParams params{};
-  params.alpha = node->alpha();
-
-  return std::make_unique<kernels::LeakyRelu>(input, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleLess *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *x = getInputTensor(node->x());
-  const Tensor *y = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Less>(x, y, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleLessEqual *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *x = getInputTensor(node->x());
-  const Tensor *y = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::LessEqual>(x, y, output);
-}
-
-std::unique_ptr<Kernel>
-KernelBuilderLet<KB::KLMN>::visit(const luci::CircleLocalResponseNormalization *node)
-{
-  assert(node->arity() == 1);
-  const Tensor *input = getInputTensor(node->input());
-  Tensor *output = getOutputTensor(node);
-
-  LocalResponseNormalizationParams params{};
-  params.radius = node->radius();
-  params.bias = node->bias();
-  params.alpha = node->alpha();
-  params.beta = node->beta();
-
-  return std::make_unique<kernels::LocalResponseNormalization>(input, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleLogicalAnd *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input1 = getInputTensor(node->x());
-  const Tensor *input2 = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::LogicalAnd>(input1, input2, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleLogicalNot *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->x());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::LogicalNot>(input, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleLogicalOr *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input1 = getInputTensor(node->x());
-  const Tensor *input2 = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::LogicalOr>(input1, input2, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleLogistic *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->x());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Logistic>(input, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleLogSoftmax *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->logits());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::LogSoftmax>(input, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleMaximum *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input1 = getInputTensor(node->x());
-  const Tensor *input2 = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Maximum>(input1, input2, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleMaxPool2D *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->value());
-  Tensor *output = getOutputTensor(node);
-
-  Pool2DParams params{};
-  params.padding = node->padding();
-  params.filter_height = node->filter()->h();
-  params.filter_width = node->filter()->w();
-  params.stride_height = node->stride()->h();
-  params.stride_width = node->stride()->w();
-  params.activation = node->fusedActivationFunction();
-
-  return std::make_unique<kernels::MaxPool2D>(input, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleMean *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *axes = getInputTensor(node->reduction_indices());
-  Tensor *output = getOutputTensor(node);
-
-  ReducerParams params{};
-  params.keep_dims = node->keep_dims();
-
-  return std::make_unique<kernels::Mean>(input, axes, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleMinimum *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input1 = getInputTensor(node->x());
-  const Tensor *input2 = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Minimum>(input1, input2, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleMirrorPad *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *paddings = getInputTensor(node->paddings());
-  Tensor *output = getOutputTensor(node);
-
-  MirrorPadParams params{};
-  params.mode = node->mode();
-
-  return std::make_unique<kernels::MirrorPad>(input, paddings, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleMul *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input1 = getInputTensor(node->x());
-  const Tensor *input2 = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  MulParams params{};
-  params.activation = node->fusedActivationFunction();
-
-  return std::make_unique<kernels::Mul>(input1, input2, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleNeg *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->x());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Neg>(input, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::KLMN>::visit(const luci::CircleNotEqual *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *x = getInputTensor(node->x());
-  const Tensor *y = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::NotEqual>(x, y, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::OPQR>::visit(const luci::CircleOutput *)
-{
-  throw std::runtime_error("Output node cannot be executed.");
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::OPQR>::visit(const luci::CirclePack *node)
-{
-  assert(node->arity() == node->values_count());
-
-  std::vector<const Tensor *> inputs(node->values_count());
-  for (uint32_t i = 0; i < node->values_count(); ++i)
-  {
-    inputs[i] = getInputTensor(node->values(i));
-  }
-  Tensor *output = getOutputTensor(node);
-
-  PackParams params{};
-  params.axis = node->axis();
-  params.values_count = node->values_count();
-
-  return std::make_unique<kernels::Pack>(std::move(inputs), output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::OPQR>::visit(const luci::CirclePad *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *paddings = getInputTensor(node->paddings());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Pad>(input, paddings, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::OPQR>::visit(const luci::CirclePadV2 *node)
-{
-  assert(node->arity() == 3);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *paddings = getInputTensor(node->paddings());
-  const Tensor *constant_values = getInputTensor(node->constant_values());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::PadV2>(input, paddings, constant_values, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::OPQR>::visit(const luci::CirclePow *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input1 = getInputTensor(node->x());
-  const Tensor *input2 = getInputTensor(node->y());
-
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Pow>(input1, input2, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::OPQR>::visit(const luci::CirclePRelu *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *alpha = getInputTensor(node->alpha());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::PRelu>(input, alpha, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::OPQR>::visit(const luci::CircleRelu *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->features());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Relu>(input, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::OPQR>::visit(const luci::CircleRelu6 *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->features());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Relu6>(input, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::OPQR>::visit(const luci::CircleReshape *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input = getInputTensor(node->tensor());
-  const Tensor *shape = getInputTensor(node->shape());
-  Tensor *output = getOutputTensor(node);
-
-  // NOTE 'newShape' attribute is ignored.
-  return std::make_unique<kernels::Reshape>(input, shape, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::OPQR>::visit(const luci::CircleResizeBilinear *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *size = getInputTensor(node->size());
-  Tensor *output = getOutputTensor(node);
-
-  ResizeBilinearParams params{};
-  params.align_corners = node->align_corners();
-  params.half_pixel_centers = node->half_pixel_centers();
-
-  return std::make_unique<kernels::ResizeBilinear>(input, size, output, params);
-}
-
-std::unique_ptr<Kernel>
-KernelBuilderLet<KB::OPQR>::visit(const luci::CircleResizeNearestNeighbor *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *size = getInputTensor(node->size());
-  Tensor *output = getOutputTensor(node);
-
-  ResizeNearestNeighborParams params{};
-  params.align_corners = node->align_corners();
-  // TODO update half_pixel_centers after CircleResizeNearestNeighbor updated
-  // Current CircleResizeNearestNeighbor don't have half_pixel_centers.
-  // default value on current is false.
-  // it need to be updated when CircleResizeNearestNeighbor updated.
-  params.half_pixel_centers = false;
-
-  return std::make_unique<kernels::ResizeNearestNeighbor>(input, size, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::OPQR>::visit(const luci::CircleReverseV2 *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input = getInputTensor(node->tensor());
-  const Tensor *axes = getInputTensor(node->axis());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::ReverseV2>(input, axes, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::OPQR>::visit(const luci::CircleRsqrt *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->x());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Rsqrt>(input, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleSlice *node)
-{
-  assert(node->arity() == 3);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *begin = getInputTensor(node->begin());
-  const Tensor *size = getInputTensor(node->size());
-
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Slice>(input, begin, size, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleSoftmax *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->logits());
-  Tensor *output = getOutputTensor(node);
-
-  SoftmaxParams params{};
-  params.beta = node->beta();
-
-  return std::make_unique<kernels::Softmax>(input, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleSpaceToBatchND *node)
-{
-  assert(node->arity() == 3);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *block_shape = getInputTensor(node->block_shape());
-  const Tensor *paddings = getInputTensor(node->paddings());
-
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::SpaceToBatchND>(input, block_shape, paddings, output);
-  ;
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleSpaceToDepth *node)
-{
-  assert(node->arity() == 1);
-  const Tensor *input = getInputTensor(node->input());
-
-  Tensor *output = getOutputTensor(node);
-
-  SpaceToDepthParams params{};
-  params.block_size = node->block_size();
-
-  return std::make_unique<kernels::SpaceToDepth>(input, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleSplit *node)
-{
-  auto output_nodes = collectOutputNodes<luci::CircleSplitOut>(node);
-  assert(node->arity() == 2);
-  assert(output_nodes.size() == static_cast<size_t>(node->num_split()));
-
-  const Tensor *axis = getInputTensor(node->split_dim());
-  const Tensor *input = getInputTensor(node->input());
-  std::vector<Tensor *> outputs = getOutputTensors(output_nodes);
-
-  // NOTE 'num_splits' attribute is ignored.
-  return std::make_unique<kernels::Split>(axis, input, std::move(outputs));
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleSqrt *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->x());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Sqrt>(input, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleSquare *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->x());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Square>(input, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleSquaredDifference *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input1 = getInputTensor(node->x());
-  const Tensor *input2 = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::SquaredDifference>(input1, input2, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleSqueeze *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->input());
-  Tensor *output = getOutputTensor(node);
-
-  SqueezeParams params{};
-  params.squeeze_dims = node->squeeze_dims();
-
-  return std::make_unique<kernels::Squeeze>(input, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleStridedSlice *node)
-{
-  assert(node->arity() == 4);
-
-  const Tensor *input = getInputTensor(node->input());
-  const Tensor *begin = getInputTensor(node->begin());
-  const Tensor *end = getInputTensor(node->end());
-  const Tensor *strides = getInputTensor(node->strides());
-
-  Tensor *output = getOutputTensor(node);
-
-  StridedSliceParams params{};
-  params.begin_mask = node->begin_mask();
-  params.ellipsis_mask = node->ellipsis_mask();
-  params.end_mask = node->end_mask();
-  params.new_axis_mask = node->new_axis_mask();
-  params.shrink_axis_mask = node->shrink_axis_mask();
-
-  return std::make_unique<kernels::StridedSlice>(input, begin, end, strides, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleSub *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input1 = getInputTensor(node->x());
-  const Tensor *input2 = getInputTensor(node->y());
-  Tensor *output = getOutputTensor(node);
-
-  SubParams params{};
-  params.activation = node->fusedActivationFunction();
-
-  return std::make_unique<kernels::Sub>(input1, input2, output, params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleTanh *node)
-{
-  assert(node->arity() == 1);
-
-  const Tensor *input = getInputTensor(node->x());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Tanh>(input, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleTranspose *node)
-{
-  assert(node->arity() == 2);
-
-  const Tensor *input = getInputTensor(node->a());
-  const Tensor *perm = getInputTensor(node->perm());
-  Tensor *output = getOutputTensor(node);
-
-  return std::make_unique<kernels::Transpose>(input, perm, output);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleTransposeConv *node)
-{
-  assert(node->arity() == 4);
-
-  const Tensor *input_sizes = getInputTensor(node->inputSizes());
-  const Tensor *filter = getInputTensor(node->filter());
-  const Tensor *out_backprop = getInputTensor(node->outBackprop());
-  const Tensor *bias = getOptionalInputTensor(node->bias());
-
-  Tensor *output = getOutputTensor(node);
-
-  TransposeConvParams params{};
-  params.padding = node->padding();
-  params.stride_height = node->stride()->h();
-  params.stride_width = node->stride()->w();
-
-  return std::make_unique<kernels::TransposeConv>(input_sizes, filter, out_backprop, bias, output,
-                                                  params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::STUV>::visit(const luci::CircleUnpack *node)
-{
-  auto output_nodes = collectOutputNodes<luci::CircleUnpackOut>(node);
-  assert(node->arity() == 1);
-  assert(output_nodes.size() == static_cast<size_t>(node->num()));
-
-  const Tensor *input = getInputTensor(node->value());
-  std::vector<Tensor *> outputs = getOutputTensors(output_nodes);
-
-  UnpackParams params{};
-  params.axis = node->axis();
-
-  // NOTE 'num' attribute is ignored.
-  return std::make_unique<kernels::Unpack>(input, std::move(outputs), params);
-}
-
-std::unique_ptr<Kernel> KernelBuilderLet<KB::WXYZ>::visit(const luci::CircleWhile *node)
-{
-  auto output_nodes = collectOutputNodes<luci::CircleWhileOut>(node);
-  assert(node->arity() == node->input_count());
-  assert(output_nodes.size() == static_cast<size_t>(node->output_count()));
-
-  std::vector<const Tensor *> inputs(node->input_count());
-  for (uint32_t i = 0; i < node->input_count(); ++i)
-  {
-    inputs[i] = getInputTensor(node->input(i));
-  }
-  std::vector<Tensor *> outputs = getOutputTensors(output_nodes);
-
-  RuntimeGraph *cond_graph = getRuntimeGraph(node->cond_graph());
-  RuntimeGraph *body_graph = getRuntimeGraph(node->body_graph());
-
-  return std::make_unique<kernels::While>(std::move(inputs), std::move(outputs), cond_graph,
-                                          body_graph);
-}
-
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.h b/compiler/luci-interpreter/src/loader/KernelBuilder.h
index 406c41ef6..b1f383394 100644
--- a/compiler/luci-interpreter/src/loader/KernelBuilder.h
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.h
@@ -30,17 +30,21 @@
 namespace luci_interpreter
 {
 
+class KernelBuilderRegistry;
+
 class KernelBuilder : public KernelBuilderHelper
 {
 public:
   KernelBuilder(
     const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
-    const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
-    : KernelBuilderHelper(graph_to_runtime_graph, node_to_tensor)
-  {
-  }
+    const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor);
+
+  ~KernelBuilder();
 
   std::unique_ptr<Kernel> build(const luci::CircleNode *node);
+
+private:
+  std::unique_ptr<KernelBuilderRegistry> _builder_registry;
 };
 
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
index d8611243e..7a457a62f 100644
--- a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
@@ -16,6 +16,7 @@
 
 #include "loader/GraphLoader.h"
 #include "loader/KernelBuilder.h"
+#include "luci_interpreter/SimpleMemoryManager.h"
 
 #include <kernels/Add.h>
 #include <kernels/ArgMax.h>
@@ -68,6 +69,7 @@
 #include <kernels/Softmax.h>
 #include <kernels/SpaceToDepth.h>
 #include <kernels/Split.h>
+#include <kernels/SplitV.h>
 #include <kernels/Sqrt.h>
 #include <kernels/SquaredDifference.h>
 #include <kernels/Squeeze.h>
@@ -91,6 +93,9 @@ class KernelBuilderTest : public Test
 {
 protected:
   luci::CircleInput *createInputNode() { return createNode<luci::CircleInput>(); }
+  void SetUp() override { _memory_manager = std::make_unique<SimpleMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
 
   template <typename NodeT, typename... Args> NodeT *createNode(Args &&... args)
   {
@@ -114,10 +119,11 @@ protected:
   {
     std::unordered_map<const loco::Graph *, RuntimeGraph *> graph_to_runtime_graph;
 
-    RuntimeGraph runtime_graph(nullptr);
+    RuntimeGraph runtime_graph(nullptr, _memory_manager.get());
+    graph_to_runtime_graph[&_graph] = &runtime_graph;
     RuntimeToIR runtime_to_ir;
     GraphLoader graph_loader(&_graph, &runtime_graph, runtime_to_ir, graph_to_runtime_graph,
-                             _node_to_tensor);
+                             _node_to_tensor, _memory_manager.get());
     graph_loader.loadTensors();
 
     KernelBuilder kernel_builder(graph_to_runtime_graph, _node_to_tensor);
@@ -1091,6 +1097,31 @@ TEST_F(KernelBuilderTest, Split)
   checkTensor(kernel->output(1), output2);
 }
 
+TEST_F(KernelBuilderTest, SplitV)
+{
+  auto *input = createInputNode();
+  auto *size_splits = createInputNode();
+  auto *axis = createInputNode();
+  auto *op = createNode<luci::CircleSplitV>();
+  auto *output0 = createNodeOut<luci::CircleSplitVOut>(op, 0);
+  auto *output1 = createNodeOut<luci::CircleSplitVOut>(op, 1);
+
+  op->input(input);
+  op->size_splits(size_splits);
+  op->split_dim(axis);
+
+  op->num_split(2);
+
+  auto kernel = buildKernel<kernels::SplitV>(op);
+  ASSERT_THAT(kernel, NotNull());
+
+  checkTensor(kernel->input(), input);
+  checkTensor(kernel->size_splits(), size_splits);
+  checkTensor(kernel->axis(), axis);
+  checkTensor(kernel->output(0), output0);
+  checkTensor(kernel->output(1), output1);
+}
+
 TEST_F(KernelBuilderTest, Sqrt)
 {
   auto *input = createInputNode();
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilderHelper.h b/compiler/luci-interpreter/src/loader/KernelBuilderHelper.h
index 4517d1f19..d6fb253b1 100644
--- a/compiler/luci-interpreter/src/loader/KernelBuilderHelper.h
+++ b/compiler/luci-interpreter/src/loader/KernelBuilderHelper.h
@@ -39,7 +39,7 @@ public:
   {
   }
 
-protected:
+public:
   const Tensor *getInputTensor(const loco::Node *node) const;
   const Tensor *getOptionalInputTensor(const loco::Node *node) const;
 
@@ -48,7 +48,7 @@ protected:
 
   RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const;
 
-protected:
+public:
   const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph() const
   {
     return _graph_to_runtime_graph;
@@ -64,6 +64,21 @@ private:
   const std::unordered_map<const loco::Node *, Tensor *> &_node_to_tensor;
 };
 
+template <typename CircleNodeOut>
+std::vector<const loco::Node *> collectOutputNodes(const loco::Node *node)
+{
+  std::vector<const CircleNodeOut *> output_nodes;
+  for (const loco::Node *loco_node : loco::succs(node))
+  {
+    output_nodes.push_back(loco::must_cast<const CircleNodeOut *>(loco_node));
+  }
+  std::sort(output_nodes.begin(), output_nodes.end(),
+            [](const CircleNodeOut *node1, const CircleNodeOut *node2) {
+              return node1->index() < node2->index();
+            });
+  return {output_nodes.cbegin(), output_nodes.cend()};
+}
+
 } // namespace luci_interpreter
 
 #endif // LUCI_INTERPRETER_LOADER_KERNELBUILDER_HELPER_H
diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
index ff211bf09..2f278b087 100644
--- a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
+++ b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
@@ -23,9 +23,10 @@ namespace luci_interpreter
 
 ModuleLoader::ModuleLoader(const luci::Module *module, RuntimeModule *runtime_module,
                            RuntimeToIR &runtime_to_ir,
-                           std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
+                           std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor,
+                           IMemoryManager *memory_manager)
   : _module(module), _runtime_module(runtime_module), _runtime_to_ir(runtime_to_ir),
-    _node_to_tensor(node_to_tensor)
+    _node_to_tensor(node_to_tensor), _memory_manager(memory_manager)
 {
 }
 
@@ -35,14 +36,14 @@ void ModuleLoader::load()
   // process for control flow nodes.
   for (size_t i = 0; i < _module->size(); ++i)
   {
-    _graph_to_runtime_graph.emplace(_module->graph(i), _runtime_module->addGraph());
+    _graph_to_runtime_graph.emplace(_module->graph(i), _runtime_module->addGraph(_memory_manager));
   }
   for (size_t i = 0; i < _module->size(); ++i)
   {
     const loco::Graph *graph = _module->graph(i);
     RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph);
     GraphLoader loader(graph, runtime_graph, _runtime_to_ir, _graph_to_runtime_graph,
-                       _node_to_tensor);
+                       _node_to_tensor, _memory_manager);
     loader.loadTensors();
     loader.initInputOutputTensors();
     loader.loadOperators();
diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.h b/compiler/luci-interpreter/src/loader/ModuleLoader.h
index 1af0ed747..11326a2ee 100644
--- a/compiler/luci-interpreter/src/loader/ModuleLoader.h
+++ b/compiler/luci-interpreter/src/loader/ModuleLoader.h
@@ -19,6 +19,7 @@
 
 #include "core/RuntimeModule.h"
 #include "loader/RuntimeToIR.h"
+#include "luci_interpreter/MemoryManager.h"
 
 #include <luci/IR/Module.h>
 
@@ -32,11 +33,13 @@ class ModuleLoader
 public:
   ModuleLoader(const luci::Module *module, RuntimeModule *runtime_module,
                RuntimeToIR &runtime_to_ir,
-               std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor);
+               std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor,
+               IMemoryManager *memory_manager);
 
   void load();
 
 private:
+  IMemoryManager *_memory_manager;
   const luci::Module *_module;
   RuntimeModule *_runtime_module;
   RuntimeToIR &_runtime_to_ir;
diff --git a/compiler/luci-interpreter/src/loader/nodes/Add.cpp b/compiler/luci-interpreter/src/loader/nodes/Add.cpp
new file mode 100644
index 000000000..decccaa1d
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Add.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Add.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleAdd(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleAdd *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input1 = helper.getInputTensor(node->x());
+  const Tensor *input2 = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  AddParams params{};
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::Add>(input1, input2, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/ArgMax.cpp b/compiler/luci-interpreter/src/loader/nodes/ArgMax.cpp
new file mode 100644
index 000000000..0ee367748
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/ArgMax.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/ArgMax.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleArgMax(const luci::CircleNode *circle_node,
+                                                  KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleArgMax *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *axis = helper.getInputTensor(node->dimension());
+  Tensor *output = helper.getOutputTensor(node);
+
+  ArgMaxParams params{};
+  params.output_type = node->output_type();
+
+  return std::make_unique<kernels::ArgMax>(input, axis, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp b/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp
new file mode 100644
index 000000000..5bc37bd4a
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/AveragePool2D.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleAveragePool2D(const luci::CircleNode *circle_node,
+                                                         KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleAveragePool2D *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->value());
+  Tensor *output = helper.getOutputTensor(node);
+
+  Pool2DParams params{};
+  params.padding = node->padding();
+  params.filter_height = node->filter()->h();
+  params.filter_width = node->filter()->w();
+  params.stride_height = node->stride()->h();
+  params.stride_width = node->stride()->w();
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::AveragePool2D>(input, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp b/compiler/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp
new file mode 100644
index 000000000..33d0e2db6
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/BatchToSpaceND.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleBatchToSpaceND(const luci::CircleNode *circle_node,
+                                                          KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleBatchToSpaceND *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 3);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *block_shape = helper.getInputTensor(node->block_shape());
+  const Tensor *crops = helper.getInputTensor(node->crops());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::BatchToSpaceND>(input, block_shape, crops, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Builders.h b/compiler/luci-interpreter/src/loader/nodes/Builders.h
new file mode 100644
index 000000000..eab284008
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Builders.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_LOADER_NODES_BUILDERS_H
+#define LUCI_INTERPRETER_LOADER_NODES_BUILDERS_H
+
+#include "loader/KernelBuilderHelper.h"
+
+#include "luci/IR/CircleNodes.h"
+
+namespace luci_interpreter
+{
+
+#define REGISTER_KERNEL(name)                                                            \
+  std::unique_ptr<Kernel> build_kernel_Circle##name(const luci::CircleNode *circle_node, \
+                                                    KernelBuilderHelper &helper);
+
+#include "KernelsToBuild.lst"
+
+#undef REGISTER_KERNEL
+
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_LOADER_NODES_BUILDERS_H
diff --git a/compiler/luci-interpreter/src/loader/nodes/Cast.cpp b/compiler/luci-interpreter/src/loader/nodes/Cast.cpp
new file mode 100644
index 000000000..21ea5ceab
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Cast.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Cast.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleCast(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleCast *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->x());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Cast>(input, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Concatenation.cpp b/compiler/luci-interpreter/src/loader/nodes/Concatenation.cpp
new file mode 100644
index 000000000..7823a9967
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Concatenation.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Concatenation.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleConcatenation(const luci::CircleNode *circle_node,
+                                                         KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleConcatenation *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  std::vector<const Tensor *> inputs(node->numValues());
+  for (uint32_t i = 0; i < node->numValues(); ++i)
+  {
+    inputs[i] = helper.getInputTensor(node->values(i));
+  }
+  Tensor *output = helper.getOutputTensor(node);
+
+  ConcatenationParams params{};
+  params.axis = node->axis();
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::Concatenation>(std::move(inputs), output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp b/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp
new file mode 100644
index 000000000..71c8ef3e4
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Conv2D.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleConv2D(const luci::CircleNode *circle_node,
+                                                  KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleConv2D *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 3);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *filter = helper.getInputTensor(node->filter());
+  const Tensor *bias = helper.getInputTensor(node->bias());
+  Tensor *output = helper.getOutputTensor(node);
+
+  auto im2col =
+    std::make_unique<Tensor>(input->element_type(), Shape({}), AffineQuantization{}, "");
+  im2col->set_observable(false);
+  im2col->set_data_buffer(nullptr);
+  Tensor *tmp = helper.getRuntimeGraph(node->graph())->addTensor(std::move(im2col));
+
+  Conv2DParams params{};
+  params.padding = node->padding();
+  params.stride_height = node->stride()->h();
+  params.stride_width = node->stride()->w();
+  params.dilation_height_factor = node->dilation()->h();
+  params.dilation_width_factor = node->dilation()->w();
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::Conv2D>(input, filter, bias, output, tmp, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/DepthToSpace.cpp b/compiler/luci-interpreter/src/loader/nodes/DepthToSpace.cpp
new file mode 100644
index 000000000..0310fb23f
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/DepthToSpace.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/DepthToSpace.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleDepthToSpace(const luci::CircleNode *circle_node,
+                                                        KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleDepthToSpace *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  Tensor *output = helper.getOutputTensor(node);
+
+  DepthToSpaceParams params{};
+  params.block_size = node->block_size();
+
+  return std::make_unique<kernels::DepthToSpace>(input, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp b/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp
new file mode 100644
index 000000000..c2f0346a2
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/DepthwiseConv2D.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleDepthwiseConv2D(const luci::CircleNode *circle_node,
+                                                           KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleDepthwiseConv2D *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 3);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *filter = helper.getInputTensor(node->filter());
+  const Tensor *bias = helper.getInputTensor(node->bias());
+  Tensor *output = helper.getOutputTensor(node);
+
+  DepthwiseConv2DParams params{};
+  params.padding = node->padding();
+  params.depth_multiplier = node->depthMultiplier();
+  params.stride_height = node->stride()->h();
+  params.stride_width = node->stride()->w();
+  params.dilation_height_factor = node->dilation()->h();
+  params.dilation_width_factor = node->dilation()->w();
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::DepthwiseConv2D>(input, filter, bias, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Div.cpp b/compiler/luci-interpreter/src/loader/nodes/Div.cpp
new file mode 100644
index 000000000..56c2e98f2
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Div.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Div.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleDiv(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleDiv *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+  const Tensor *input1 = helper.getInputTensor(node->x());
+  const Tensor *input2 = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  DivParams params{};
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::Div>(input1, input2, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Elu.cpp b/compiler/luci-interpreter/src/loader/nodes/Elu.cpp
new file mode 100644
index 000000000..98ee78be7
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Elu.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Elu.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleElu(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleElu *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->features());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Elu>(input, output);
+}
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Equal.cpp b/compiler/luci-interpreter/src/loader/nodes/Equal.cpp
new file mode 100644
index 000000000..649d9bfe9
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Equal.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Equal.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleEqual(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+
+{
+  const auto *node = dynamic_cast<const luci::CircleEqual *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *x = helper.getInputTensor(node->x());
+  const Tensor *y = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Equal>(x, y, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Exp.cpp b/compiler/luci-interpreter/src/loader/nodes/Exp.cpp
new file mode 100644
index 000000000..411d142c3
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Exp.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Exp.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleExp(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleExp *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->x());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Exp>(input, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Floor.cpp b/compiler/luci-interpreter/src/loader/nodes/Floor.cpp
new file mode 100644
index 000000000..6d8435f6c
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Floor.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Floor.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleFloor(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleFloor *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->x());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Floor>(input, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/FloorDiv.cpp b/compiler/luci-interpreter/src/loader/nodes/FloorDiv.cpp
new file mode 100644
index 000000000..cae2e186e
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/FloorDiv.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/FloorDiv.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleFloorDiv(const luci::CircleNode *circle_node,
+                                                    KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleFloorDiv *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *x = helper.getInputTensor(node->x());
+  const Tensor *y = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::FloorDiv>(x, y, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp b/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp
new file mode 100644
index 000000000..2917598fc
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/FullyConnected.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleFullyConnected(const luci::CircleNode *circle_node,
+                                                          KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleFullyConnected *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 3);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *weights = helper.getInputTensor(node->weights());
+  const Tensor *bias = helper.getOptionalInputTensor(node->bias());
+  Tensor *output = helper.getOutputTensor(node);
+
+  FullyConnectedParams params{};
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::FullyConnected>(input, weights, bias, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Greater.cpp b/compiler/luci-interpreter/src/loader/nodes/Greater.cpp
new file mode 100644
index 000000000..3db11b840
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Greater.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Greater.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleGreater(const luci::CircleNode *circle_node,
+                                                   KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleGreater *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *x = helper.getInputTensor(node->x());
+  const Tensor *y = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Greater>(x, y, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/GreaterEqual.cpp b/compiler/luci-interpreter/src/loader/nodes/GreaterEqual.cpp
new file mode 100644
index 000000000..dbe051d67
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/GreaterEqual.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/GreaterEqual.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleGreaterEqual(const luci::CircleNode *circle_node,
+                                                        KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleGreaterEqual *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *x = helper.getInputTensor(node->x());
+  const Tensor *y = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::GreaterEqual>(x, y, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/If.cpp b/compiler/luci-interpreter/src/loader/nodes/If.cpp
new file mode 100644
index 000000000..5983f4d3b
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/If.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/If.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleIf(const luci::CircleNode *circle_node,
+                                              KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleIf *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  auto output_nodes = collectOutputNodes<luci::CircleIfOut>(node);
+  assert(node->arity() == 1 + node->input_count());
+  assert(output_nodes.size() == static_cast<size_t>(node->output_count()));
+
+  const Tensor *cond = helper.getInputTensor(node->cond());
+  std::vector<const Tensor *> inputs(node->input_count());
+  for (uint32_t i = 0; i < node->input_count(); ++i)
+  {
+    inputs[i] = helper.getInputTensor(node->input(i));
+  }
+  std::vector<Tensor *> outputs = helper.getOutputTensors(output_nodes);
+
+  RuntimeGraph *then_graph = helper.getRuntimeGraph(node->then_graph());
+  RuntimeGraph *else_graph = helper.getRuntimeGraph(node->else_graph());
+
+  return std::make_unique<kernels::If>(cond, std::move(inputs), std::move(outputs), then_graph,
+                                       else_graph);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/InstanceNorm.cpp b/compiler/luci-interpreter/src/loader/nodes/InstanceNorm.cpp
new file mode 100644
index 000000000..0a8fb85e2
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/InstanceNorm.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/InstanceNorm.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleInstanceNorm(const luci::CircleNode *circle_node,
+                                                        KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleInstanceNorm *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 3);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *gamma = helper.getInputTensor(node->gamma());
+  const Tensor *beta = helper.getInputTensor(node->beta());
+
+  Tensor *output = helper.getOutputTensor(node);
+
+  InstanceNormParams params{};
+  params.epsilon = node->epsilon();
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::InstanceNorm>(input, gamma, beta, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/L2Normalize.cpp b/compiler/luci-interpreter/src/loader/nodes/L2Normalize.cpp
new file mode 100644
index 000000000..05f920266
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/L2Normalize.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/L2Normalize.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleL2Normalize(const luci::CircleNode *circle_node,
+                                                       KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleL2Normalize *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->x());
+  Tensor *output = helper.getOutputTensor(node);
+
+  L2NormParams params{};
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::L2Normalize>(input, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/L2Pool2D.cpp b/compiler/luci-interpreter/src/loader/nodes/L2Pool2D.cpp
new file mode 100644
index 000000000..0e70afafa
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/L2Pool2D.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/L2Pool2D.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleL2Pool2D(const luci::CircleNode *circle_node,
+                                                    KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleL2Pool2D *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->value());
+  Tensor *output = helper.getOutputTensor(node);
+
+  Pool2DParams params{};
+  params.padding = node->padding();
+  params.filter_height = node->filter()->h();
+  params.filter_width = node->filter()->w();
+  params.stride_height = node->stride()->h();
+  params.stride_width = node->stride()->w();
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::L2Pool2D>(input, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/LeakyRelu.cpp b/compiler/luci-interpreter/src/loader/nodes/LeakyRelu.cpp
new file mode 100644
index 000000000..7b229ad0e
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/LeakyRelu.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/LeakyRelu.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLeakyRelu(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleLeakyRelu *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+  const Tensor *input = helper.getInputTensor(node->features());
+  Tensor *output = helper.getOutputTensor(node);
+
+  LeakyReluParams params{};
+  params.alpha = node->alpha();
+
+  return std::make_unique<kernels::LeakyRelu>(input, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Less.cpp b/compiler/luci-interpreter/src/loader/nodes/Less.cpp
new file mode 100644
index 000000000..81156f275
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Less.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Less.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLess(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleLess *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *x = helper.getInputTensor(node->x());
+  const Tensor *y = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Less>(x, y, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/LessEqual.cpp b/compiler/luci-interpreter/src/loader/nodes/LessEqual.cpp
new file mode 100644
index 000000000..82141e5ae
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/LessEqual.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/LessEqual.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLessEqual(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleLessEqual *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *x = helper.getInputTensor(node->x());
+  const Tensor *y = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::LessEqual>(x, y, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp b/compiler/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp
new file mode 100644
index 000000000..a12dce0a0
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/LocalResponseNormalization.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel>
+build_kernel_CircleLocalResponseNormalization(const luci::CircleNode *circle_node,
+                                              KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleLocalResponseNormalization *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+  const Tensor *input = helper.getInputTensor(node->input());
+  Tensor *output = helper.getOutputTensor(node);
+
+  LocalResponseNormalizationParams params{};
+  params.radius = node->radius();
+  params.bias = node->bias();
+  params.alpha = node->alpha();
+  params.beta = node->beta();
+
+  return std::make_unique<kernels::LocalResponseNormalization>(input, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/LogSoftmax.cpp b/compiler/luci-interpreter/src/loader/nodes/LogSoftmax.cpp
new file mode 100644
index 000000000..6cf547aae
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/LogSoftmax.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/LogSoftmax.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLogSoftmax(const luci::CircleNode *circle_node,
+                                                      KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleLogSoftmax *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->logits());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::LogSoftmax>(input, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/LogicalAnd.cpp b/compiler/luci-interpreter/src/loader/nodes/LogicalAnd.cpp
new file mode 100644
index 000000000..2c9549f71
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/LogicalAnd.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/LogicalAnd.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLogicalAnd(const luci::CircleNode *circle_node,
+                                                      KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleLogicalAnd *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input1 = helper.getInputTensor(node->x());
+  const Tensor *input2 = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::LogicalAnd>(input1, input2, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/LogicalNot.cpp b/compiler/luci-interpreter/src/loader/nodes/LogicalNot.cpp
new file mode 100644
index 000000000..3d327d6c4
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/LogicalNot.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/LogicalNot.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLogicalNot(const luci::CircleNode *circle_node,
+                                                      KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleLogicalNot *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->x());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::LogicalNot>(input, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/LogicalOr.cpp b/compiler/luci-interpreter/src/loader/nodes/LogicalOr.cpp
new file mode 100644
index 000000000..50566bb30
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/LogicalOr.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/LogicalOr.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLogicalOr(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleLogicalOr *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input1 = helper.getInputTensor(node->x());
+  const Tensor *input2 = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::LogicalOr>(input1, input2, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Logistic.cpp b/compiler/luci-interpreter/src/loader/nodes/Logistic.cpp
new file mode 100644
index 000000000..e4160edb3
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Logistic.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Logistic.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleLogistic(const luci::CircleNode *circle_node,
+                                                    KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleLogistic *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->x());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Logistic>(input, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/MaxPool2D.cpp b/compiler/luci-interpreter/src/loader/nodes/MaxPool2D.cpp
new file mode 100644
index 000000000..914f22838
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/MaxPool2D.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/MaxPool2D.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleMaxPool2D(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleMaxPool2D *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->value());
+  Tensor *output = helper.getOutputTensor(node);
+
+  Pool2DParams params{};
+  params.padding = node->padding();
+  params.filter_height = node->filter()->h();
+  params.filter_width = node->filter()->w();
+  params.stride_height = node->stride()->h();
+  params.stride_width = node->stride()->w();
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::MaxPool2D>(input, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Maximum.cpp b/compiler/luci-interpreter/src/loader/nodes/Maximum.cpp
new file mode 100644
index 000000000..dc50d6773
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Maximum.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Maximum.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleMaximum(const luci::CircleNode *circle_node,
+                                                   KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleMaximum *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input1 = helper.getInputTensor(node->x());
+  const Tensor *input2 = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Maximum>(input1, input2, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Mean.cpp b/compiler/luci-interpreter/src/loader/nodes/Mean.cpp
new file mode 100644
index 000000000..97d91207f
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Mean.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Mean.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleMean(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleMean *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *axes = helper.getInputTensor(node->reduction_indices());
+  Tensor *output = helper.getOutputTensor(node);
+
+  auto temp_index_unique =
+    std::make_unique<Tensor>(DataType::S32, Shape({}), AffineQuantization{}, "");
+  temp_index_unique->set_observable(false);
+  temp_index_unique->set_data_buffer(nullptr);
+  Tensor *temp_index =
+    helper.getRuntimeGraph(node->graph())->addTensor(std::move(temp_index_unique));
+
+  auto resolved_axes_unique =
+    std::make_unique<Tensor>(DataType::S32, Shape({}), AffineQuantization{}, "");
+  resolved_axes_unique->set_observable(false);
+  resolved_axes_unique->set_data_buffer(nullptr);
+  Tensor *resolved_axes =
+    helper.getRuntimeGraph(node->graph())->addTensor(std::move(resolved_axes_unique));
+
+  auto temp_sum_unique =
+    std::make_unique<Tensor>(input->element_type(), Shape({}), AffineQuantization{}, "");
+  temp_sum_unique->set_observable(false);
+  temp_sum_unique->set_data_buffer(nullptr);
+  Tensor *temp_sum = helper.getRuntimeGraph(node->graph())->addTensor(std::move(temp_sum_unique));
+
+  ReducerParams params{};
+  params.keep_dims = node->keep_dims();
+
+  return std::make_unique<kernels::Mean>(input, axes, output, temp_index, resolved_axes, temp_sum,
+                                         params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Minimum.cpp b/compiler/luci-interpreter/src/loader/nodes/Minimum.cpp
new file mode 100644
index 000000000..ff659524a
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Minimum.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Minimum.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleMinimum(const luci::CircleNode *circle_node,
+                                                   KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleMinimum *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input1 = helper.getInputTensor(node->x());
+  const Tensor *input2 = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Minimum>(input1, input2, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/MirrorPad.cpp b/compiler/luci-interpreter/src/loader/nodes/MirrorPad.cpp
new file mode 100644
index 000000000..ebf294583
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/MirrorPad.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/MirrorPad.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleMirrorPad(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleMirrorPad *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *paddings = helper.getInputTensor(node->paddings());
+  Tensor *output = helper.getOutputTensor(node);
+
+  MirrorPadParams params{};
+  params.mode = node->mode();
+
+  return std::make_unique<kernels::MirrorPad>(input, paddings, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Mul.cpp b/compiler/luci-interpreter/src/loader/nodes/Mul.cpp
new file mode 100644
index 000000000..4f9da967d
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Mul.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Mul.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleMul(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleMul *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input1 = helper.getInputTensor(node->x());
+  const Tensor *input2 = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  MulParams params{};
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::Mul>(input1, input2, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Neg.cpp b/compiler/luci-interpreter/src/loader/nodes/Neg.cpp
new file mode 100644
index 000000000..23c00537b
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Neg.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Neg.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleNeg(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleNeg *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->x());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Neg>(input, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/NotEqual.cpp b/compiler/luci-interpreter/src/loader/nodes/NotEqual.cpp
new file mode 100644
index 000000000..8e5711fc1
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/NotEqual.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/NotEqual.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleNotEqual(const luci::CircleNode *circle_node,
+                                                    KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleNotEqual *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *x = helper.getInputTensor(node->x());
+  const Tensor *y = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::NotEqual>(x, y, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/PRelu.cpp b/compiler/luci-interpreter/src/loader/nodes/PRelu.cpp
new file mode 100644
index 000000000..e31601bf6
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/PRelu.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/PRelu.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CirclePRelu(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CirclePRelu *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *alpha = helper.getInputTensor(node->alpha());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::PRelu>(input, alpha, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Pack.cpp b/compiler/luci-interpreter/src/loader/nodes/Pack.cpp
new file mode 100644
index 000000000..699472081
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Pack.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Pack.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CirclePack(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CirclePack *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == node->values_count());
+
+  std::vector<const Tensor *> inputs(node->values_count());
+  for (uint32_t i = 0; i < node->values_count(); ++i)
+  {
+    inputs[i] = helper.getInputTensor(node->values(i));
+  }
+  Tensor *output = helper.getOutputTensor(node);
+
+  PackParams params{};
+  params.axis = node->axis();
+  params.values_count = node->values_count();
+
+  return std::make_unique<kernels::Pack>(std::move(inputs), output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Pad.cpp b/compiler/luci-interpreter/src/loader/nodes/Pad.cpp
new file mode 100644
index 000000000..770549295
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Pad.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Pad.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CirclePad(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CirclePad *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *paddings = helper.getInputTensor(node->paddings());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Pad>(input, paddings, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/PadV2.cpp b/compiler/luci-interpreter/src/loader/nodes/PadV2.cpp
new file mode 100644
index 000000000..12deb15f0
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/PadV2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/PadV2.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CirclePadV2(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CirclePadV2 *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 3);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *paddings = helper.getInputTensor(node->paddings());
+  const Tensor *constant_values = helper.getInputTensor(node->constant_values());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::PadV2>(input, paddings, constant_values, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Pow.cpp b/compiler/luci-interpreter/src/loader/nodes/Pow.cpp
new file mode 100644
index 000000000..b430bc94f
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Pow.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Pow.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CirclePow(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CirclePow *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input1 = helper.getInputTensor(node->x());
+  const Tensor *input2 = helper.getInputTensor(node->y());
+
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Pow>(input1, input2, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Relu.cpp b/compiler/luci-interpreter/src/loader/nodes/Relu.cpp
new file mode 100644
index 000000000..d53a66a06
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Relu.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Relu.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleRelu(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleRelu *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->features());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Relu>(input, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Relu6.cpp b/compiler/luci-interpreter/src/loader/nodes/Relu6.cpp
new file mode 100644
index 000000000..f1b5d219b
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Relu6.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Relu6.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleRelu6(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleRelu6 *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->features());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Relu6>(input, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Reshape.cpp b/compiler/luci-interpreter/src/loader/nodes/Reshape.cpp
new file mode 100644
index 000000000..89e3ecebf
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Reshape.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Reshape.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleReshape(const luci::CircleNode *circle_node,
+                                                   KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleReshape *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input = helper.getInputTensor(node->tensor());
+  const Tensor *shape = helper.getInputTensor(node->shape());
+  Tensor *output = helper.getOutputTensor(node);
+
+  // NOTE 'newShape' attribute is ignored.
+  return std::make_unique<kernels::Reshape>(input, shape, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp b/compiler/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp
new file mode 100644
index 000000000..dca56588d
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/ResizeBilinear.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleResizeBilinear(const luci::CircleNode *circle_node,
+                                                          KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleResizeBilinear *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *size = helper.getInputTensor(node->size());
+  Tensor *output = helper.getOutputTensor(node);
+
+  ResizeBilinearParams params{};
+  params.align_corners = node->align_corners();
+  params.half_pixel_centers = node->half_pixel_centers();
+
+  return std::make_unique<kernels::ResizeBilinear>(input, size, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp b/compiler/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp
new file mode 100644
index 000000000..d1ea19c0f
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/ResizeNearestNeighbor.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel>
+build_kernel_CircleResizeNearestNeighbor(const luci::CircleNode *circle_node,
+                                         KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleResizeNearestNeighbor *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *size = helper.getInputTensor(node->size());
+  Tensor *output = helper.getOutputTensor(node);
+
+  ResizeNearestNeighborParams params{};
+  params.align_corners = node->align_corners();
+  // TODO update half_pixel_centers after CircleResizeNearestNeighbor updated
+  // Current CircleResizeNearestNeighbor don't have half_pixel_centers.
+  // default value on current is false.
+  // it need to be updated when CircleResizeNearestNeighbor updated.
+  params.half_pixel_centers = false;
+
+  return std::make_unique<kernels::ResizeNearestNeighbor>(input, size, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/ReverseV2.cpp b/compiler/luci-interpreter/src/loader/nodes/ReverseV2.cpp
new file mode 100644
index 000000000..ea00f5408
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/ReverseV2.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/ReverseV2.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleReverseV2(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleReverseV2 *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input = helper.getInputTensor(node->tensor());
+  const Tensor *axes = helper.getInputTensor(node->axis());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::ReverseV2>(input, axes, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Rsqrt.cpp b/compiler/luci-interpreter/src/loader/nodes/Rsqrt.cpp
new file mode 100644
index 000000000..ff87f435c
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Rsqrt.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Rsqrt.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleRsqrt(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleRsqrt *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->x());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Rsqrt>(input, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Slice.cpp b/compiler/luci-interpreter/src/loader/nodes/Slice.cpp
new file mode 100644
index 000000000..741cd0806
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Slice.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Slice.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSlice(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleSlice *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 3);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *begin = helper.getInputTensor(node->begin());
+  const Tensor *size = helper.getInputTensor(node->size());
+
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Slice>(input, begin, size, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Softmax.cpp b/compiler/luci-interpreter/src/loader/nodes/Softmax.cpp
new file mode 100644
index 000000000..b15e4b6f3
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Softmax.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Softmax.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSoftmax(const luci::CircleNode *circle_node,
+                                                   KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleSoftmax *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->logits());
+  Tensor *output = helper.getOutputTensor(node);
+
+  SoftmaxParams params{};
+  params.beta = node->beta();
+
+  return std::make_unique<kernels::Softmax>(input, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp b/compiler/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp
new file mode 100644
index 000000000..91c237aa5
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/SpaceToBatchND.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSpaceToBatchND(const luci::CircleNode *circle_node,
+                                                          KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleSpaceToBatchND *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 3);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *block_shape = helper.getInputTensor(node->block_shape());
+  const Tensor *paddings = helper.getInputTensor(node->paddings());
+
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::SpaceToBatchND>(input, block_shape, paddings, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp b/compiler/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp
new file mode 100644
index 000000000..3cbbd9718
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/SpaceToDepth.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSpaceToDepth(const luci::CircleNode *circle_node,
+                                                        KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleSpaceToDepth *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+  const Tensor *input = helper.getInputTensor(node->input());
+
+  Tensor *output = helper.getOutputTensor(node);
+
+  SpaceToDepthParams params{};
+  params.block_size = node->block_size();
+
+  return std::make_unique<kernels::SpaceToDepth>(input, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Split.cpp b/compiler/luci-interpreter/src/loader/nodes/Split.cpp
new file mode 100644
index 000000000..32553ad5e
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Split.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Split.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSplit(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleSplit *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  auto output_nodes = collectOutputNodes<luci::CircleSplitOut>(node);
+  assert(node->arity() == 2);
+  assert(output_nodes.size() == static_cast<size_t>(node->num_split()));
+
+  const Tensor *axis = helper.getInputTensor(node->split_dim());
+  const Tensor *input = helper.getInputTensor(node->input());
+  std::vector<Tensor *> outputs = helper.getOutputTensors(output_nodes);
+
+  // NOTE 'num_splits' attribute is ignored.
+  return std::make_unique<kernels::Split>(axis, input, std::move(outputs));
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/SplitV.cpp b/compiler/luci-interpreter/src/loader/nodes/SplitV.cpp
new file mode 100644
index 000000000..d78816447
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/SplitV.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/SplitV.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSplitV(const luci::CircleNode *circle_node,
+                                                  KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleSplitV *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  auto output_nodes = collectOutputNodes<luci::CircleSplitVOut>(node);
+  assert(node->arity() == 3);
+  assert(output_nodes.size() == static_cast<size_t>(node->num_split()));
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *sizes_data = helper.getInputTensor(node->size_splits());
+  const Tensor *axis = helper.getInputTensor(node->split_dim());
+  std::vector<Tensor *> outputs = helper.getOutputTensors(output_nodes);
+
+  // NOTE 'num_splits' attribute is ignored.
+  return std::make_unique<kernels::SplitV>(input, sizes_data, axis, std::move(outputs));
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Sqrt.cpp b/compiler/luci-interpreter/src/loader/nodes/Sqrt.cpp
new file mode 100644
index 000000000..56dd986f1
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Sqrt.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Sqrt.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSqrt(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleSqrt *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->x());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Sqrt>(input, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Square.cpp b/compiler/luci-interpreter/src/loader/nodes/Square.cpp
new file mode 100644
index 000000000..43aadb969
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Square.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Square.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSquare(const luci::CircleNode *circle_node,
+                                                  KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleSquare *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->x());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Square>(input, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/SquaredDifference.cpp b/compiler/luci-interpreter/src/loader/nodes/SquaredDifference.cpp
new file mode 100644
index 000000000..6a2717aa2
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/SquaredDifference.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/SquaredDifference.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSquaredDifference(const luci::CircleNode *circle_node,
+                                                             KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleSquaredDifference *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input1 = helper.getInputTensor(node->x());
+  const Tensor *input2 = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::SquaredDifference>(input1, input2, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Squeeze.cpp b/compiler/luci-interpreter/src/loader/nodes/Squeeze.cpp
new file mode 100644
index 000000000..583ff9314
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Squeeze.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Squeeze.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSqueeze(const luci::CircleNode *circle_node,
+                                                   KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleSqueeze *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  Tensor *output = helper.getOutputTensor(node);
+
+  SqueezeParams params{};
+  params.squeeze_dims = node->squeeze_dims();
+
+  return std::make_unique<kernels::Squeeze>(input, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/StridedSlice.cpp b/compiler/luci-interpreter/src/loader/nodes/StridedSlice.cpp
new file mode 100644
index 000000000..fe5fa7707
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/StridedSlice.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/StridedSlice.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleStridedSlice(const luci::CircleNode *circle_node,
+                                                        KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleStridedSlice *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 4);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *begin = helper.getInputTensor(node->begin());
+  const Tensor *end = helper.getInputTensor(node->end());
+  const Tensor *strides = helper.getInputTensor(node->strides());
+
+  Tensor *output = helper.getOutputTensor(node);
+
+  StridedSliceParams params{};
+  params.begin_mask = node->begin_mask();
+  params.ellipsis_mask = node->ellipsis_mask();
+  params.end_mask = node->end_mask();
+  params.new_axis_mask = node->new_axis_mask();
+  params.shrink_axis_mask = node->shrink_axis_mask();
+
+  return std::make_unique<kernels::StridedSlice>(input, begin, end, strides, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Sub.cpp b/compiler/luci-interpreter/src/loader/nodes/Sub.cpp
new file mode 100644
index 000000000..bad4fbb13
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Sub.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Sub.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleSub(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleSub *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input1 = helper.getInputTensor(node->x());
+  const Tensor *input2 = helper.getInputTensor(node->y());
+  Tensor *output = helper.getOutputTensor(node);
+
+  SubParams params{};
+  params.activation = node->fusedActivationFunction();
+
+  return std::make_unique<kernels::Sub>(input1, input2, output, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Tanh.cpp b/compiler/luci-interpreter/src/loader/nodes/Tanh.cpp
new file mode 100644
index 000000000..f4255291b
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Tanh.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Tanh.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleTanh(const luci::CircleNode *circle_node,
+                                                KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleTanh *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 1);
+
+  const Tensor *input = helper.getInputTensor(node->x());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Tanh>(input, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Transpose.cpp b/compiler/luci-interpreter/src/loader/nodes/Transpose.cpp
new file mode 100644
index 000000000..4e095fbbc
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Transpose.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Transpose.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleTranspose(const luci::CircleNode *circle_node,
+                                                     KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleTranspose *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 2);
+
+  const Tensor *input = helper.getInputTensor(node->a());
+  const Tensor *perm = helper.getInputTensor(node->perm());
+  Tensor *output = helper.getOutputTensor(node);
+
+  return std::make_unique<kernels::Transpose>(input, perm, output);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/TransposeConv.cpp b/compiler/luci-interpreter/src/loader/nodes/TransposeConv.cpp
new file mode 100644
index 000000000..1b954c35c
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/TransposeConv.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/TransposeConv.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleTransposeConv(const luci::CircleNode *circle_node,
+                                                         KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleTransposeConv *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  assert(node->arity() == 4);
+
+  const Tensor *input_sizes = helper.getInputTensor(node->inputSizes());
+  const Tensor *filter = helper.getInputTensor(node->filter());
+  const Tensor *out_backprop = helper.getInputTensor(node->outBackprop());
+  const Tensor *bias = helper.getOptionalInputTensor(node->bias());
+
+  Tensor *output = helper.getOutputTensor(node);
+
+  DataType scratch_data_type =
+    helper.getInputTensor(node)->element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
+
+  auto scratch_tensor =
+    std::make_unique<Tensor>(scratch_data_type, Shape({}), AffineQuantization{}, "");
+  scratch_tensor->set_observable(false);
+  scratch_tensor->set_data_buffer(nullptr);
+  Tensor *tmp = helper.getRuntimeGraph(node->graph())->addTensor(std::move(scratch_tensor));
+
+  TransposeConvParams params{};
+  params.padding = node->padding();
+  params.stride_height = node->stride()->h();
+  params.stride_width = node->stride()->w();
+
+  return std::make_unique<kernels::TransposeConv>(input_sizes, filter, out_backprop, bias, output,
+                                                  tmp, params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Unpack.cpp b/compiler/luci-interpreter/src/loader/nodes/Unpack.cpp
new file mode 100644
index 000000000..978c738c6
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Unpack.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Unpack.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleUnpack(const luci::CircleNode *circle_node,
+                                                  KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleUnpack *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+  auto output_nodes = collectOutputNodes<luci::CircleUnpackOut>(node);
+  assert(node->arity() == 1);
+  assert(output_nodes.size() == static_cast<size_t>(node->num()));
+
+  const Tensor *input = helper.getInputTensor(node->value());
+  std::vector<Tensor *> outputs = helper.getOutputTensors(output_nodes);
+
+  UnpackParams params{};
+  params.axis = node->axis();
+
+  // NOTE 'num' attribute is ignored.
+  return std::make_unique<kernels::Unpack>(input, std::move(outputs), params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/While.cpp b/compiler/luci-interpreter/src/loader/nodes/While.cpp
new file mode 100644
index 000000000..284dc0c68
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/While.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/While.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleWhile(const luci::CircleNode *circle_node,
+                                                 KernelBuilderHelper &helper)
+{
+  const auto *node = dynamic_cast<const luci::CircleWhile *>(circle_node);
+  if (node == nullptr)
+    throw std::runtime_error("wrong builder for operation");
+
+  auto output_nodes = collectOutputNodes<luci::CircleWhileOut>(node);
+  assert(node->arity() == node->input_count());
+  assert(output_nodes.size() == static_cast<size_t>(node->output_count()));
+
+  std::vector<const Tensor *> inputs(node->input_count());
+  for (uint32_t i = 0; i < node->input_count(); ++i)
+  {
+    inputs[i] = helper.getInputTensor(node->input(i));
+  }
+  std::vector<Tensor *> outputs = helper.getOutputTensors(output_nodes);
+
+  RuntimeGraph *cond_graph = helper.getRuntimeGraph(node->cond_graph());
+  RuntimeGraph *body_graph = helper.getRuntimeGraph(node->body_graph());
+
+  return std::make_unique<kernels::While>(std::move(inputs), std::move(outputs), cond_graph,
+                                          body_graph);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-micro/CMakeLists.txt b/compiler/luci-micro/CMakeLists.txt
new file mode 100644
index 000000000..d936e12ba
--- /dev/null
+++ b/compiler/luci-micro/CMakeLists.txt
@@ -0,0 +1,57 @@
+set(ARM_C_COMPILER "arm-none-eabi-gcc")
+set(ARM_ASM_COMPILER "arm-none-eabi-gcc")
+set(ARM_CXX_COMPILER "arm-none-eabi-g++")
+set(ARM_OBJCOPY "arm-none-eabi-objcopy")
+
+find_program(ARM_C_COMPILER_PATH ${ARM_C_COMPILER})
+
+if(NOT ARM_C_COMPILER_PATH)
+  message(WARNING "ARM compiler is NOT FOUND, skipping luci-micro build")
+  return()
+endif()
+
+set(CMAKE_ARM_OPTIONS
+  -DLUCI_INTERPRETER_STATIC=ON
+  -DLUCI_STATIC=ON
+  "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/standalone/Toolchain.cmake"
+  "-DLUCI_INTERPRETER_PAL_DIR=${CMAKE_CURRENT_SOURCE_DIR}/../luci-interpreter/pal/mcu"
+  "-DNNAS_PROJECT_SOURCE_DIR=${NNAS_PROJECT_SOURCE_DIR}"
+  "-DNNAS_EXTERNALS_DIR=${NNAS_EXTERNALS_DIR}"
+  -DCPU_ARCH=arm
+  -DC_COMPILER=${ARM_C_COMPILER}
+  -DCXX_COMPILER=${ARM_CXX_COMPILER}
+  -DASM_COMPILER=${ARM_ASM_COMPILER}
+  -DOBJCOPY=${ARM_OBJCOPY}
+  -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+  -DENABLE_TEST=OFF
+  -DBUILD_GTEST=OFF
+  "-DNNAS_ROOT=${NNAS_PROJECT_SOURCE_DIR}"
+  -DENABLE_STRICT_BUILD=OFF
+)
+
+set(MICRO_ARM_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/standalone_arm")
+file(MAKE_DIRECTORY "${MICRO_ARM_BUILD_DIR}")
+
+set(MICRO_ARM_BUILD_DEPENDENCY "${MICRO_ARM_BUILD_DIR}/CMakeCache.txt")
+
+add_custom_command(
+  OUTPUT "${MICRO_ARM_BUILD_DEPENDENCY}"
+  COMMAND "${CMAKE_COMMAND}" "${CMAKE_CURRENT_SOURCE_DIR}/standalone" ${CMAKE_ARM_OPTIONS}
+  WORKING_DIRECTORY "${MICRO_ARM_BUILD_DIR}"
+  DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/standalone/CMakeLists.txt"
+  VERBATIM
+)
+
+add_custom_target(luci_interpreter_micro_arm_cmake DEPENDS "${MICRO_ARM_BUILD_DEPENDENCY}")
+
+set(MICRO_ARM_BINARY "${MICRO_ARM_BUILD_DIR}/compiler/luci-interpreter/src/libluci_interpreter.a")
+
+add_custom_command(
+  OUTPUT "${MICRO_ARM_BINARY}"
+  COMMAND "${CMAKE_MAKE_PROGRAM}" luci_interpreter -j ${CPU_COUNT}
+  WORKING_DIRECTORY "${MICRO_ARM_BUILD_DIR}"
+  DEPENDS luci_interpreter_micro_arm_cmake
+  VERBATIM
+)
+
+add_custom_target(luci_interpreter_micro_arm DEPENDS "${MICRO_ARM_BINARY}")
diff --git a/compiler/luci-micro/README.md b/compiler/luci-micro/README.md
new file mode 100644
index 000000000..6641ad7a7
--- /dev/null
+++ b/compiler/luci-micro/README.md
@@ -0,0 +1,56 @@
+# luci-micro
+
+`luci-micro` is MCU specialized build of luci-interpreter with several benchmark applications.
+
+## Contents
+
+Luci-micro contains cmake infrastructure to build:
+- stand-alone interpreter library
+- benchmark applications using luci interpreter on arm MCUs
+
+## How to build stand alone library
+
+Stand-alone library is simply built by `luci_interpreter_micro_arm` target.
+Result library will be placed in  `<ONE root>/build/compiler/luci-micro/standalone_arm/luci-interpreter/src/libluci_interpreter.a`.
+
+### Prerequisites
+
+- Everything you need for ONE project: see [how-to-build-compiler.md](../../docs/howto/how-to-build-compiler.md)
+- arm-none-eabi-gcc and arm-none-eabi-g++ compilers
+
+To install needed arm compilers on ubuntu:
+```
+$ sudo apt-get install gcc-arm-none-eabi
+```
+
+**cmake build**
+
+``` bash
+$ cd <path to ONE>
+$ mkdir build
+# cd build
+$ cmake ../infra/nncc
+$ make -j$(nproc) luci_interpreter_micro_arm
+```
+
+**nncc script build**
+
+``` bash
+$ cd <path to ONE>
+$ ./nncc configure
+$ ./nncc build -j$(nproc) luci_interpreter_micro_arm
+```
+
+### Known issues
+
+Interpreter uses TensorFlow headers that produces warnings.
+
+`Linux` x86 build uses "-isystem" flag to suppress warnings from external sources,
+but some old arm compilers have issues with it:
+[bug](https://bugs.launchpad.net/gcc-arm-embedded/+bug/1698539)
+
+`-isystem` hack is disabled for MCU build, because of this MCU build is broken if `-Werror` flag is set.
+
+## How to use
+
+TBD
diff --git a/compiler/luci-micro/requires.cmake b/compiler/luci-micro/requires.cmake
new file mode 100644
index 000000000..5913aa9ad
--- /dev/null
+++ b/compiler/luci-micro/requires.cmake
@@ -0,0 +1 @@
+require(luci-interpreter)
diff --git a/compiler/luci-micro/standalone/CMakeLists.txt b/compiler/luci-micro/standalone/CMakeLists.txt
new file mode 100644
index 000000000..7953359ad
--- /dev/null
+++ b/compiler/luci-micro/standalone/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.10)
+project(luci_interpreter_micro_standalone)
+
+# Add fake target, so nothing is build
+set(BUILD_WHITELIST "dummy")
+
+add_subdirectory(${NNAS_ROOT}/infra/nncc ${CMAKE_CURRENT_BINARY_DIR}/nncc)
+
+set(ONE_COMPILER_SRC_DIR "${NNAS_PROJECT_SOURCE_DIR}/compiler")
+
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/loco ${CMAKE_CURRENT_BINARY_DIR}/loco)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/angkor ${CMAKE_CURRENT_BINARY_DIR}/angkor)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/oops ${CMAKE_CURRENT_BINARY_DIR}/oops)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/pepper-str ${CMAKE_CURRENT_BINARY_DIR}/pepper-str)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/logo ${CMAKE_CURRENT_BINARY_DIR}/logo)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/logo-core ${CMAKE_CURRENT_BINARY_DIR}/logo-core)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/locomotiv ${CMAKE_CURRENT_BINARY_DIR}/locomotiv)
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/luci/lang ${CMAKE_CURRENT_BINARY_DIR}/luci/lang)
+
+add_subdirectory(${ONE_COMPILER_SRC_DIR}/luci-interpreter ${CMAKE_CURRENT_BINARY_DIR}/luci-interpreter)
diff --git a/compiler/luci-micro/standalone/Toolchain.cmake b/compiler/luci-micro/standalone/Toolchain.cmake
new file mode 100644
index 000000000..2d23b5de5
--- /dev/null
+++ b/compiler/luci-micro/standalone/Toolchain.cmake
@@ -0,0 +1,8 @@
+set(CMAKE_SYSTEM_NAME Generic)
+
+set(CMAKE_SYSTEM_PROCESSOR "${CPU_ARCH}")
+set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
+set(CMAKE_C_COMPILER "${C_COMPILER}")
+set(CMAKE_CXX_COMPILER "${CXX_COMPILER}")
+set(CMAKE_ASM_COMPILER "${ASM_COMPILER}")
+set(CMAKE_OBJCOPY "${OBJCOPY}")
diff --git a/compiler/luci-pass-value-test/CMakeLists.txt b/compiler/luci-pass-value-test/CMakeLists.txt
index 2d2befe57..b31415870 100644
--- a/compiler/luci-pass-value-test/CMakeLists.txt
+++ b/compiler/luci-pass-value-test/CMakeLists.txt
@@ -38,7 +38,7 @@ add_test(NAME luci_pass_value_test
   COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/eval_driver.sh"
           "${CMAKE_CURRENT_BINARY_DIR}"
           "${ARTIFACTS_BIN_PATH}"
-          "${NNCC_OVERLAY_DIR}/venv_2_3_0"
+          "${NNCC_OVERLAY_DIR}/venv_2_6_0"
           "$<TARGET_FILE:luci_eval_driver>"
           ${LUCI_PASS_VALUE_TESTS}
 )
diff --git a/compiler/luci-value-test/CMakeLists.txt b/compiler/luci-value-test/CMakeLists.txt
index 124f120d4..3c7185b80 100644
--- a/compiler/luci-value-test/CMakeLists.txt
+++ b/compiler/luci-value-test/CMakeLists.txt
@@ -18,7 +18,7 @@ add_test(NAME luci_value_test
   COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/evalverify.sh"
           "${CMAKE_CURRENT_BINARY_DIR}"
           "${ARTIFACTS_BIN_PATH}"
-          "${NNCC_OVERLAY_DIR}/venv_2_3_0"
+          "${NNCC_OVERLAY_DIR}/venv_2_6_0"
           "$<TARGET_FILE:luci_eval_driver>"
           ${LUCI_VALUE_TESTS}
 )
diff --git a/compiler/luci-value-test/README.md b/compiler/luci-value-test/README.md
index 90e92834b..6f1d0d54f 100644
--- a/compiler/luci-value-test/README.md
+++ b/compiler/luci-value-test/README.md
@@ -5,11 +5,15 @@
 The test proceeds as follows
 
 Step 1: Generate tflite files and circle files from TFLite recipes (listsed in test.lst).
+```
 "TFLite recipe" -> tflchef -> "tflite file" -> tflite2circle -> "circle file"
+```
 
 Step 2: Run TFLite interpreter and luci-interpreter for the generated tflite and circle, respectively.
 (with the same input tensors filled with random values)
+```
 circle file -> luci-interpreter -------> Execution result 1
 tflite file -> TFLite interpreter -----> Execution result 2
+```
 
 Step 3: Compare the execution result 1 and 2. The result must be the same.
diff --git a/compiler/luci-value-test/luci_eval_verifier.py b/compiler/luci-value-test/luci_eval_verifier.py
index f6b0620d8..a76bd1403 100755
--- a/compiler/luci-value-test/luci_eval_verifier.py
+++ b/compiler/luci-value-test/luci_eval_verifier.py
@@ -64,41 +64,23 @@ for idx in range(len(interpreter.get_output_details())):
     shape_file = open(circle_model + ".output" + str(idx) + ".shape", 'r')
     output_shape = [int(i) for i in shape_file.read().split(',')]
     luci_output_data = np.reshape(output_data, output_shape)
+    intp_output_data = interpreter.get_tensor(output_details["index"])
     try:
         if output_details["dtype"] == np.uint8:
-            if np.allclose(
-                    luci_output_data,
-                    interpreter.get_tensor(
-                        interpreter.get_output_details()[idx]["index"]),
-                    rtol=0,
-                    atol=0) == False:
+            if np.allclose(luci_output_data, intp_output_data, rtol=0, atol=0) == False:
                 raise SystemExit("Execution result of " + tflite_model +
                                  " does not match with " + circle_model)
         elif output_details["dtype"] == np.float32:
             if np.allclose(
-                    luci_output_data,
-                    interpreter.get_tensor(
-                        interpreter.get_output_details()[idx]["index"]),
-                    rtol=1.e-5,
-                    atol=1.e-5) == False:
+                    luci_output_data, intp_output_data, rtol=1.e-5, atol=1.e-5) == False:
                 raise SystemExit("Execution result of " + tflite_model +
                                  " does not match with " + circle_model)
         elif output_details["dtype"] == np.int64:
-            if np.allclose(
-                    luci_output_data,
-                    interpreter.get_tensor(
-                        interpreter.get_output_details()[idx]["index"]),
-                    rtol=0,
-                    atol=0) == False:
+            if np.allclose(luci_output_data, intp_output_data, rtol=0, atol=0) == False:
                 raise SystemExit("Execution result of " + tflite_model +
                                  " does not match with " + circle_model)
         elif output_details["dtype"] == np.int32:
-            if np.allclose(
-                    luci_output_data,
-                    interpreter.get_tensor(
-                        interpreter.get_output_details()[idx]["index"]),
-                    rtol=0,
-                    atol=0) == False:
+            if np.allclose(luci_output_data, intp_output_data, rtol=0, atol=0) == False:
                 raise SystemExit("Execution result of " + tflite_model +
                                  " does not match with " + circle_model)
         else:
diff --git a/compiler/luci/CMakeLists.txt b/compiler/luci/CMakeLists.txt
index 95c349c0d..9dcf1b55d 100644
--- a/compiler/luci/CMakeLists.txt
+++ b/compiler/luci/CMakeLists.txt
@@ -1,3 +1,14 @@
+# Some targets do not support dynamic linking: MCU, TrustZone applications, etc.
+# STATIC_LUCI option allows us to compile luci and luci related components safely
+# and suppress various cmake warnings.
+#
+# Currently this feature is used for luci-interpreter MCU builds.
+if (STATIC_LUCI)
+  set(LIBRARY_TYPE "STATIC")
+else()
+  set(LIBRARY_TYPE "SHARED")
+endif()
+
 add_subdirectory(env)
 add_subdirectory(log)
 add_subdirectory(lang)
@@ -6,6 +17,7 @@ add_subdirectory(testhelper)
 add_subdirectory(service)
 add_subdirectory(pass)
 add_subdirectory(profile)
+add_subdirectory(plan)
 add_subdirectory(partition)
 add_subdirectory(import)
 add_subdirectory(export)
diff --git a/compiler/luci/env/CMakeLists.txt b/compiler/luci/env/CMakeLists.txt
index 4d1a89ad1..bba515551 100644
--- a/compiler/luci/env/CMakeLists.txt
+++ b/compiler/luci/env/CMakeLists.txt
@@ -2,7 +2,11 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
 file(GLOB_RECURSE TESTS "src/*.test.cpp")
 list(REMOVE_ITEM SOURCES ${TESTS})
 
-add_library(luci_env SHARED ${SOURCES})
+if (NOT LIBRARY_TYPE)
+  set(LIBRARY_TYPE "SHARED")
+endif(NOT LIBRARY_TYPE)
+
+add_library(luci_env ${LIBRARY_TYPE} ${SOURCES})
 target_include_directories(luci_env PUBLIC include)
 target_link_libraries(luci_env PRIVATE nncc_common)
 install(TARGETS luci_env DESTINATION lib)
diff --git a/compiler/luci/env/include/luci/UserSettings.h b/compiler/luci/env/include/luci/UserSettings.h
index b56bd65e2..9fe9592e5 100644
--- a/compiler/luci/env/include/luci/UserSettings.h
+++ b/compiler/luci/env/include/luci/UserSettings.h
@@ -33,6 +33,7 @@ struct UserSettings
     MuteWarnings,
     DisableValidation,
     ProfilingDataGen,
+    ExecutionPlanGen,
   };
 
   static UserSettings *settings();
diff --git a/compiler/luci/env/src/UserSettings.cpp b/compiler/luci/env/src/UserSettings.cpp
index b4c661190..136fee799 100644
--- a/compiler/luci/env/src/UserSettings.cpp
+++ b/compiler/luci/env/src/UserSettings.cpp
@@ -31,6 +31,7 @@ private:
   bool _MuteWarnings{false};
   bool _DisableValidation{false};
   bool _ProfilingDataGen{false};
+  bool _ExecutionPlanGen{false};
 };
 
 void UserSettingsImpl::set(const Key key, bool value)
@@ -46,6 +47,9 @@ void UserSettingsImpl::set(const Key key, bool value)
     case Key::ProfilingDataGen:
       _ProfilingDataGen = value;
       break;
+    case Key::ExecutionPlanGen:
+      _ExecutionPlanGen = value;
+      break;
     default:
       throw std::runtime_error("Invalid key in boolean set");
       break;
@@ -62,6 +66,8 @@ bool UserSettingsImpl::get(const Key key) const
       return _DisableValidation;
     case Key::ProfilingDataGen:
       return _ProfilingDataGen;
+    case Key::ExecutionPlanGen:
+      return _ExecutionPlanGen;
     default:
       throw std::runtime_error("Invalid key in boolean get");
       break;
diff --git a/compiler/luci/env/src/UserSettings.test.cpp b/compiler/luci/env/src/UserSettings.test.cpp
index 899c0c2a1..26c606edb 100644
--- a/compiler/luci/env/src/UserSettings.test.cpp
+++ b/compiler/luci/env/src/UserSettings.test.cpp
@@ -39,6 +39,18 @@ TEST(UserSettings, MuteWarnings)
   ASSERT_TRUE(settings->get(luci::UserSettings::Key::MuteWarnings));
 }
 
+TEST(UserSettings, MuteWarnings_NEG)
+{
+  auto settings = luci::UserSettings::settings();
+  ASSERT_NE(nullptr, settings);
+
+  settings->set(luci::UserSettings::Key::MuteWarnings, false);
+  ASSERT_FALSE(settings->get(luci::UserSettings::Key::MuteWarnings));
+
+  settings->set(luci::UserSettings::Key::MuteWarnings, true);
+  ASSERT_FALSE(settings->get(luci::UserSettings::Key::DisableValidation));
+}
+
 TEST(UserSettings, DisableValidation)
 {
   auto settings = luci::UserSettings::settings();
@@ -51,6 +63,18 @@ TEST(UserSettings, DisableValidation)
   ASSERT_TRUE(settings->get(luci::UserSettings::Key::DisableValidation));
 }
 
+TEST(UserSettings, DisableValidation_NEG)
+{
+  auto settings = luci::UserSettings::settings();
+  ASSERT_NE(nullptr, settings);
+
+  settings->set(luci::UserSettings::Key::DisableValidation, false);
+  ASSERT_FALSE(settings->get(luci::UserSettings::Key::DisableValidation));
+
+  settings->set(luci::UserSettings::Key::DisableValidation, true);
+  ASSERT_FALSE(settings->get(luci::UserSettings::Key::ProfilingDataGen));
+}
+
 TEST(UserSettings, ProfilingDataGen)
 {
   auto settings = luci::UserSettings::settings();
diff --git a/compiler/luci/export/CMakeLists.txt b/compiler/luci/export/CMakeLists.txt
index 5c0077625..2b41a6248 100644
--- a/compiler/luci/export/CMakeLists.txt
+++ b/compiler/luci/export/CMakeLists.txt
@@ -3,7 +3,11 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
 #file(GLOB_RECURSE TESTS "src/*.test.cpp")
 #list(REMOVE_ITEM SOURCES ${TESTS})
 
-add_library(luci_export SHARED ${SOURCES})
+if (NOT LIBRARY_TYPE)
+    set(LIBRARY_TYPE "SHARED")
+endif(NOT LIBRARY_TYPE)
+
+add_library(luci_export ${LIBRARY_TYPE} ${SOURCES})
 target_include_directories(luci_export PRIVATE src)
 target_include_directories(luci_export PUBLIC include)
 target_link_libraries(luci_export PRIVATE luci_lang)
@@ -14,6 +18,7 @@ target_link_libraries(luci_export PRIVATE luci_env)
 target_link_libraries(luci_export PRIVATE luci_log)
 target_link_libraries(luci_export PRIVATE luci_logex)
 target_link_libraries(luci_export PRIVATE luci_profile)
+target_link_libraries(luci_export PRIVATE luci_plan)
 target_link_libraries(luci_export PRIVATE nncc_common)
 target_link_libraries(luci_export PRIVATE locop)
 target_link_libraries(luci_export PRIVATE oops)
diff --git a/compiler/luci/export/src/CircleExportMetadata.cpp b/compiler/luci/export/src/CircleExportMetadata.cpp
index ef905a882..017002f5c 100644
--- a/compiler/luci/export/src/CircleExportMetadata.cpp
+++ b/compiler/luci/export/src/CircleExportMetadata.cpp
@@ -44,6 +44,31 @@ flatbuffers::Offset<circle::Metadata> metadata_offset(flatbuffers::FlatBufferBui
 namespace luci
 {
 
+// 'execution_plan_table' is encoded to binary format.
+const std::vector<uint8_t> CircleExportMetadata::encoded_execution_plan_table()
+{
+  std::vector<uint8_t> data;
+
+  write_u32(data, _execution_plan_table.size());
+
+  for (auto &kv : _execution_plan_table)
+  {
+    const auto id = kv.first;
+    write_u32(data, id);
+
+    const auto plan_vector = kv.second;
+    const auto size = plan_vector.size();
+    write_u32(data, size);
+
+    for (auto elem : plan_vector)
+    {
+      write_u32(data, elem);
+    }
+  }
+
+  return data;
+}
+
 // 'source_table' is encoded to binary format.
 const std::vector<uint8_t> CircleExportMetadata::encoded_source_table(void)
 {
@@ -114,7 +139,11 @@ createCircleMetadataVector(flatbuffers::FlatBufferBuilder &builder, luci::Serial
     metadata_vec.emplace_back(
       metadata_offset(builder, md, md._metadata.encoded_op_table(), "ONE_op_table"));
   }
-
+  if (settings->get(luci::UserSettings::Key::ExecutionPlanGen))
+  {
+    metadata_vec.emplace_back(metadata_offset(
+      builder, md, md._metadata.encoded_execution_plan_table(), "ONE_execution_plan_table"));
+  }
   return metadata_vec;
 }
 
diff --git a/compiler/luci/export/src/CircleOperationExporter.cpp b/compiler/luci/export/src/CircleOperationExporter.cpp
index 014d9bd61..be64a52d4 100644
--- a/compiler/luci/export/src/CircleOperationExporter.cpp
+++ b/compiler/luci/export/src/CircleOperationExporter.cpp
@@ -22,6 +22,7 @@
 #include <luci/IR/CircleNodes.h>
 #include <luci/IR/CircleNodeVisitor.h>
 #include <luci/Profile/CircleNodeOrigin.h>
+#include <luci/Plan/CircleNodeExecutionPlan.h>
 #include <luci/UserSettings.h>
 #include <luci/Log.h>
 
@@ -1684,7 +1685,7 @@ void OpExporterLet<OE::CIRC>::visit(luci::CircleInstanceNorm *node)
 }
 
 void exportNode(loco::Node *node, flatbuffers::FlatBufferBuilder &builder, SerializedModelData &md,
-                SerializedGraphData &gd)
+                SerializedGraphData &gd, uint32_t node_position)
 {
   if (auto circle_node = dynamic_cast<luci::CircleNode *>(node))
   {
@@ -1702,6 +1703,19 @@ void exportNode(loco::Node *node, flatbuffers::FlatBufferBuilder &builder, Seria
         md._metadata.add_op_table(node_id, source->id());
       }
     }
+    if (has_execution_plan(circle_node))
+    {
+      // Add to node (in node_position) metadata vector with execution_plan information:
+      // order of execution, and offsets output tensors.
+      const auto execution_plan = get_execution_plan(circle_node);
+      std::vector<uint32_t> execution_plan_vector;
+      execution_plan_vector.push_back(execution_plan.order_in_plan());
+      for (auto offset : execution_plan.offsets())
+      {
+        execution_plan_vector.push_back(offset);
+      }
+      md._metadata.add_execution_plan_table(node_position, execution_plan_vector);
+    }
   }
   else
   {
@@ -1717,9 +1731,11 @@ namespace luci
 void exportNodes(loco::Graph *g, FlatBufferBuilder &builder, SerializedModelData &md,
                  SerializedGraphData &gd)
 {
+  uint32_t node_position = 0;
   for (auto node : loco::postorder_traversal(loco::output_nodes(g)))
   {
-    exportNode(node, builder, md, gd);
+    exportNode(node, builder, md, gd, node_position);
+    node_position++;
   }
 }
 
diff --git a/compiler/luci/export/src/SerializedData.h b/compiler/luci/export/src/SerializedData.h
index 95f7b5755..a945eecf7 100644
--- a/compiler/luci/export/src/SerializedData.h
+++ b/compiler/luci/export/src/SerializedData.h
@@ -20,6 +20,7 @@
 #include <mio/circle/schema_generated.h>
 
 #include <luci/IR/CircleNodes.h>
+#include <luci/IR/ExecutionPlanTable.h>
 
 #include <vector>
 
@@ -63,13 +64,23 @@ public:
     _op_table.at(node_id).emplace(source_id);
   }
 
+  void add_execution_plan_table(uint32_t node_id,
+                                const std::vector<uint32_t> &execution_plan_inform)
+  {
+    _execution_plan_table[node_id] = execution_plan_inform;
+  }
+
 public:
   const std::vector<uint8_t> encoded_source_table(void);
   const std::vector<uint8_t> encoded_op_table(void);
+  const std::vector<uint8_t> encoded_execution_plan_table(void);
 
 private:
   std::map<uint32_t, std::string> _source_table;
   std::map<uint32_t, std::set<uint32_t>> _op_table;
+  // _exec_plan_table stores for node with node_id order of execution, and memory offsets:
+  // first go execution order, then memory offsets for node output tensors.
+  luci::ExecutionPlanTable _execution_plan_table;
 };
 
 } // namespace luci
diff --git a/compiler/luci/import/CMakeLists.txt b/compiler/luci/import/CMakeLists.txt
index 4e200f6ae..1df569d11 100644
--- a/compiler/luci/import/CMakeLists.txt
+++ b/compiler/luci/import/CMakeLists.txt
@@ -2,11 +2,16 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
 file(GLOB_RECURSE TESTS "src/*.test.cpp")
 list(REMOVE_ITEM SOURCES ${TESTS})
 
-add_library(luci_import SHARED ${SOURCES})
+if (NOT LIBRARY_TYPE)
+  set(LIBRARY_TYPE "SHARED")
+endif(NOT LIBRARY_TYPE)
+
+add_library(luci_import ${LIBRARY_TYPE} ${SOURCES})
 target_include_directories(luci_import PRIVATE src)
 target_include_directories(luci_import PUBLIC include)
 target_link_libraries(luci_import PUBLIC luci_lang)
 target_link_libraries(luci_import PUBLIC luci_profile)
+target_link_libraries(luci_import PUBLIC luci_plan)
 target_link_libraries(luci_import PUBLIC mio_circle)
 target_link_libraries(luci_import PRIVATE luci_env)
 target_link_libraries(luci_import PRIVATE luci_log)
diff --git a/compiler/luci/import/src/CircleImportMetadata.cpp b/compiler/luci/import/src/CircleImportMetadata.cpp
index f68f3301a..42dcebdaa 100644
--- a/compiler/luci/import/src/CircleImportMetadata.cpp
+++ b/compiler/luci/import/src/CircleImportMetadata.cpp
@@ -134,6 +134,55 @@ decoded_op_table(const std::vector<uint8_t> &op_table_data)
   return node_source_ids_map;
 }
 
+// 'execution_plan_table' is decoded to std::map<uint32_t, std::vector<uint32_t>> format.
+const luci::ExecutionPlanTable
+decoded_execution_plan(const std::vector<uint8_t> &execution_plan_data)
+{
+  luci::ExecutionPlanTable execution_plan_table;
+  uint32_t idx = 0;
+
+  if (execution_plan_data.size() < 4)
+    throw std::runtime_error("Op table decode error : invalid entry number");
+
+  uint32_t entry_number = read_u32(execution_plan_data, idx);
+  idx += sizeof(uint32_t);
+
+  while (idx < execution_plan_data.size())
+  {
+    if (idx + 2 * sizeof(uint32_t) > execution_plan_data.size())
+      throw std::runtime_error("Op table decode error : invalid entry item");
+
+    uint32_t id = read_u32(execution_plan_data, idx);
+    idx += sizeof(uint32_t);
+
+    uint32_t size = read_u32(execution_plan_data, idx);
+    idx += sizeof(uint32_t);
+
+    if (idx + sizeof(uint32_t) * size > execution_plan_data.size())
+      throw std::runtime_error("Source table decode error : invalid entry data");
+
+    std::vector<uint32_t> execution_plan_vector;
+    for (uint32_t j = 0; j < size; ++j)
+    {
+      uint32_t execution_plan_inform = read_u32(execution_plan_data, idx);
+      idx += sizeof(uint32_t);
+
+      execution_plan_vector.push_back(execution_plan_inform);
+    }
+
+    if (execution_plan_table.insert({id, execution_plan_vector}).second == false)
+      throw std::runtime_error("Op table decode error : duplicated origin ID");
+  }
+
+  if (idx != execution_plan_data.size())
+    throw std::runtime_error("Op table decode error : data size invalid");
+
+  if (execution_plan_table.size() != entry_number)
+    throw std::runtime_error("Op table decode error : entry number invalid");
+
+  return execution_plan_table;
+}
+
 } // namespace
 
 namespace luci
@@ -153,6 +202,8 @@ CircleImportMetadata::CircleImportMetadata(const luci::CircleReader &reader)
       _op_table = decoded_op_table(buffer);
     else if (meta.name.compare("ONE_source_table") == 0)
       _source_table = decoded_source_table(buffer);
+    else if (meta.name.compare("ONE_execution_plan_table") == 0)
+      _execution_plan_table = decoded_execution_plan(buffer);
   }
 }
 
diff --git a/compiler/luci/import/src/CircleImportMetadata.h b/compiler/luci/import/src/CircleImportMetadata.h
index 007985dcc..0e0240678 100644
--- a/compiler/luci/import/src/CircleImportMetadata.h
+++ b/compiler/luci/import/src/CircleImportMetadata.h
@@ -20,6 +20,7 @@
 #include "luci/Import/CircleReader.h"
 
 #include <luci/Profile/CircleNodeOrigin.h>
+#include <luci/IR/ExecutionPlanTable.h>
 
 #include <map>
 #include <set>
@@ -47,10 +48,15 @@ public:
 
   const std::map<uint32_t, std::string> &source_table(void) const { return _source_table; }
 
+  const luci::ExecutionPlanTable &execution_plan_table(void) const { return _execution_plan_table; }
+
 private:
   // Decoded metadata is stored
   std::map<uint32_t, std::string> _source_table;
   std::map<uint32_t, std::set<uint32_t>> _op_table;
+  // _execution_plan_table stores for node with node_id order of execution,
+  // and offsets output tensors
+  luci::ExecutionPlanTable _execution_plan_table;
 };
 
 } // namespace luci
diff --git a/compiler/luci/import/src/Importer.cpp b/compiler/luci/import/src/Importer.cpp
index 68baefab0..8eae5fcf4 100644
--- a/compiler/luci/import/src/Importer.cpp
+++ b/compiler/luci/import/src/Importer.cpp
@@ -28,6 +28,7 @@
 #include <luci/IR/CircleNodes.h>
 #include <luci/Profile/CircleNodeID.h>
 #include <luci/Profile/CircleNodeOrigin.h>
+#include <luci/Plan/CircleNodeExecutionPlan.h>
 #include <luci/Log.h>
 #include <luci/LogHelper.h>
 
@@ -344,6 +345,25 @@ std::unique_ptr<Module> Importer::importModule(const circle::Model *model) const
     module->source_table(table);
   }
 
+  // Add execution_plan annotations
+  if (circle_metadata->execution_plan_table().size() > 0)
+  {
+    auto execution_plan_table = circle_metadata->execution_plan_table();
+    auto node_position = 0;
+    for (auto node : loco::postorder_traversal(loco::output_nodes(module->graph())))
+    {
+      if (auto circle_node = dynamic_cast<luci::CircleNode *>(node))
+      {
+        auto node_plan = execution_plan_table[node_position];
+        luci::add_execution_plan(
+          circle_node,
+          luci::CircleNodeExecutionPlan(
+            node_plan[0], std::vector<uint32_t>(node_plan.begin() + 1, node_plan.end())));
+      }
+      node_position++;
+    }
+  }
+
   return module;
 }
 
diff --git a/compiler/luci/lang/CMakeLists.txt b/compiler/luci/lang/CMakeLists.txt
index 669a866b1..433b7cd4e 100644
--- a/compiler/luci/lang/CMakeLists.txt
+++ b/compiler/luci/lang/CMakeLists.txt
@@ -2,7 +2,11 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
 file(GLOB_RECURSE TESTS "src/*.test.cpp")
 list(REMOVE_ITEM SOURCES ${TESTS})
 
-add_library(luci_lang SHARED ${SOURCES})
+if (NOT LIBRARY_TYPE)
+  set(LIBRARY_TYPE "SHARED")
+endif(NOT LIBRARY_TYPE)
+
+add_library(luci_lang ${LIBRARY_TYPE} ${SOURCES})
 target_include_directories(luci_lang PRIVATE src)
 target_include_directories(luci_lang PUBLIC include)
 target_link_libraries(luci_lang PUBLIC loco)
diff --git a/compiler/luci/lang/include/luci/IR/ExecutionPlanTable.h b/compiler/luci/lang/include/luci/IR/ExecutionPlanTable.h
new file mode 100644
index 000000000..5c33c1123
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/ExecutionPlanTable.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_EXECUTION_PLAN_TABLE_H__
+#define __LUCI_EXECUTION_PLAN_TABLE_H__
+
+namespace luci
+{
+
+using ExecutionPlanTable = std::map<uint32_t, std::vector<uint32_t>>;
+
+} // namespace luci
+
+#endif // __LUCI_EXECUTION_PLAN_TABLE_H__
diff --git a/compiler/luci/log/CMakeLists.txt b/compiler/luci/log/CMakeLists.txt
index 23bd00828..b64a0651e 100644
--- a/compiler/luci/log/CMakeLists.txt
+++ b/compiler/luci/log/CMakeLists.txt
@@ -1,7 +1,11 @@
 # TODO Find how to test logging framework
 file(GLOB_RECURSE SOURCES "src/*.cpp")
 
-add_library(luci_log SHARED ${SOURCES})
+if (NOT LIBRARY_TYPE)
+    set(LIBRARY_TYPE "SHARED")
+endif(NOT LIBRARY_TYPE)
+
+add_library(luci_log ${LIBRARY_TYPE} ${SOURCES})
 target_include_directories(luci_log PUBLIC include)
 target_link_libraries(luci_log PUBLIC hermes)
 target_link_libraries(luci_log PRIVATE hermes_std)
diff --git a/compiler/luci/logex/CMakeLists.txt b/compiler/luci/logex/CMakeLists.txt
index cd2571ba1..4d801b046 100644
--- a/compiler/luci/logex/CMakeLists.txt
+++ b/compiler/luci/logex/CMakeLists.txt
@@ -1,7 +1,11 @@
 # TODO Find how to test logging-ex utility
 file(GLOB_RECURSE SOURCES "src/*.cpp")
 
-add_library(luci_logex SHARED ${SOURCES})
+if (NOT LIBRARY_TYPE)
+    set(LIBRARY_TYPE "SHARED")
+endif(NOT LIBRARY_TYPE)
+
+add_library(luci_logex ${LIBRARY_TYPE} ${SOURCES})
 target_include_directories(luci_logex PUBLIC include)
 target_link_libraries(luci_logex PUBLIC loco)
 target_link_libraries(luci_logex PUBLIC locop)
diff --git a/compiler/luci/partition/CMakeLists.txt b/compiler/luci/partition/CMakeLists.txt
index 236b689c4..eacbe1ccc 100644
--- a/compiler/luci/partition/CMakeLists.txt
+++ b/compiler/luci/partition/CMakeLists.txt
@@ -2,7 +2,11 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
 file(GLOB_RECURSE TESTS "src/*.test.cpp")
 list(REMOVE_ITEM SOURCES ${TESTS})
 
-add_library(luci_partition SHARED ${SOURCES})
+if (NOT LIBRARY_TYPE)
+  set(LIBRARY_TYPE "SHARED")
+endif(NOT LIBRARY_TYPE)
+
+add_library(luci_partition ${LIBRARY_TYPE} ${SOURCES})
 target_include_directories(luci_partition PRIVATE src)
 target_include_directories(luci_partition PUBLIC include)
 target_link_libraries(luci_partition PUBLIC luci_lang)
diff --git a/compiler/luci/pass/CMakeLists.txt b/compiler/luci/pass/CMakeLists.txt
index fd06c6d52..2361bb4f5 100644
--- a/compiler/luci/pass/CMakeLists.txt
+++ b/compiler/luci/pass/CMakeLists.txt
@@ -1,8 +1,18 @@
+nnas_find_package(FlatBuffers EXACT 1.10 QUIET)
+if(NOT FlatBuffers_FOUND)
+  message(STATUS "FlatBuffers NOT FOUND")
+  return()
+endif(NOT FlatBuffers_FOUND)
+
 file(GLOB_RECURSE SOURCES "src/*.cpp")
 file(GLOB_RECURSE TESTS "src/*.test.cpp")
 list(REMOVE_ITEM SOURCES ${TESTS})
 
-add_library(luci_pass SHARED ${SOURCES})
+if (NOT LIBRARY_TYPE)
+  set(LIBRARY_TYPE "SHARED")
+endif(NOT LIBRARY_TYPE)
+
+add_library(luci_pass ${LIBRARY_TYPE} ${SOURCES})
 target_include_directories(luci_pass PRIVATE src)
 target_include_directories(luci_pass PUBLIC include)
 target_link_libraries(luci_pass PUBLIC loco)
@@ -13,9 +23,11 @@ target_link_libraries(luci_pass PRIVATE luci_log)
 target_link_libraries(luci_pass PRIVATE luci_service)
 target_link_libraries(luci_pass PRIVATE luci_logex)
 target_link_libraries(luci_pass PRIVATE luci_profile)
+target_link_libraries(luci_pass PRIVATE mio_tflite260_inc)
 target_link_libraries(luci_pass PRIVATE nncc_common)
 target_link_libraries(luci_pass PRIVATE pepper_csv2vec)
 target_link_libraries(luci_pass PRIVATE oops)
+target_link_libraries(luci_pass PRIVATE flatbuffers-1.12)
 install(TARGETS luci_pass DESTINATION lib)
 install(DIRECTORY include/ DESTINATION include
         FILES_MATCHING PATTERN "*.h")
@@ -31,4 +43,5 @@ target_include_directories(luci_pass_test PRIVATE src)
 target_link_libraries(luci_pass_test luci_pass)
 target_link_libraries(luci_pass_test luci_lang)
 target_link_libraries(luci_pass_test luci_testhelper)
+target_link_libraries(luci_pass_test flatbuffers-1.12)
 #target_link_libraries(luci_pass_test oops)
diff --git a/compiler/luci/pass/include/luci/CircleOptimizer.h b/compiler/luci/pass/include/luci/CircleOptimizer.h
index 3bcc7c5bb..917cacae9 100644
--- a/compiler/luci/pass/include/luci/CircleOptimizer.h
+++ b/compiler/luci/pass/include/luci/CircleOptimizer.h
@@ -34,6 +34,7 @@ public:
   {
     enum Algorithm
     {
+      FuseAddWithFullyConnected,
       FuseAddWithTConv,
       FuseBatchNormWithConv,
       FuseBatchNormWithDwConv,
@@ -51,8 +52,10 @@ public:
       Requantize,
       FoldAddV2,
       FoldCast,
+      FoldDepthwiseConv2D,
       FoldDequantize,
       FoldSparseToDense,
+      ForceQuantParam,
       ForwardReshapeToUnaryOp,
       SparsifyTensorPass,
       FusePreActivationBatchNorm,
@@ -64,7 +67,9 @@ public:
       ReplaceSubWithAdd,
       SubstitutePackToReshape,
       SubstitutePadV2ToPad,
+      SubstituteSplitVToSplit,
       SubstituteSqueezeToReshape,
+      ExpandBroadcastConst,
       ConvertNCHWToNHWC,
       RemoveUnnecessarySlice,
       RemoveUnnecessaryStridedSlice,
@@ -82,9 +87,12 @@ public:
     enum AlgorithmParameters
     {
       // quantize
-      Quantize_input_dtype,
-      Quantize_output_dtype,
+      Quantize_input_model_dtype,
+      Quantize_output_model_dtype,
       Quantize_granularity, // layer-wise or channel-wise
+      Quantize_tensor_names,
+      Quantize_scales,
+      Quantize_zero_points,
 
       // sparsify
       Sparsify_tensor_name,
@@ -96,6 +104,9 @@ public:
       // convert NCHW to NHWC
       NCHW_to_NHWC_input_shape,
       NCHW_to_NHWC_output_shape,
+
+      Quantize_input_dtype = Quantize_input_model_dtype,   // TODO Remove this
+      Quantize_output_dtype = Quantize_output_model_dtype, // TODO Remove this
     };
 
     virtual ~Options() = default;
@@ -104,6 +115,8 @@ public:
     virtual bool query(Algorithm) = 0;
     virtual void param(AlgorithmParameters, const std::string &) = 0;
     virtual const std::string param(AlgorithmParameters) const = 0;
+    virtual void params(AlgorithmParameters, std::vector<std::string> &) = 0;
+    virtual std::vector<std::string> params(AlgorithmParameters) const = 0;
   };
 
 public:
diff --git a/compiler/luci/pass/include/luci/Pass/ExpandBroadcastConstPass.h b/compiler/luci/pass/include/luci/Pass/ExpandBroadcastConstPass.h
new file mode 100644
index 000000000..5ee26b472
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/ExpandBroadcastConstPass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_EXPAND_BROADCAST_CONST_PASS_H__
+#define __LUCI_EXPAND_BROADCAST_CONST_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to remove broadcasts of Const nodes.
+ */
+struct ExpandBroadcastConstPass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::ExpandBroadcastConstPass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_EXPAND_BROADCAST_CONST_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/FoldDepthwiseConv2DPass.h b/compiler/luci/pass/include/luci/Pass/FoldDepthwiseConv2DPass.h
new file mode 100644
index 000000000..58e5b71a7
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/FoldDepthwiseConv2DPass.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FOLD_DEPTHWISE_CONV_2D_PASS_H__
+#define __LUCI_FOLD_DEPTHWISE_CONV_2D_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to fold DepthwiseConv2D with constant input and filter into a
+ * constant tensor
+ */
+struct FoldDepthwiseConv2DPass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::FoldDepthwiseConv2DPass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FOLD_DEPTHWISE_CONV_2D_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/ForceQuantParamPass.h b/compiler/luci/pass/include/luci/Pass/ForceQuantParamPass.h
new file mode 100644
index 000000000..752ce1d31
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/ForceQuantParamPass.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FORCE_QUANT_PARAM_PASS_H__
+#define __LUCI_FORCE_QUANT_PARAM_PASS_H__
+
+#include <loco.h>
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Pass to write quantparam (scale, zerop) to the specified tensors
+ */
+class ForceQuantParamPass : public logo::Pass
+{
+public:
+  using TensorVector = std::vector<std::string>;
+  using ScaleVector = std::vector<float>;
+  using ZPVector = std::vector<int64_t>;
+
+public:
+  ForceQuantParamPass(TensorVector &tensors, ScaleVector &scales, ZPVector &zerops)
+    : _tensors{tensors}, _scales{scales}, _zerops{zerops}
+  {
+    // DO NOTHING
+  }
+  virtual const char *name(void) const { return "luci::ForceQuantParamPass"; }
+
+public:
+  bool run(loco::Graph *graph);
+
+private:
+  TensorVector _tensors;
+  ScaleVector _scales;
+  ZPVector _zerops;
+};
+
+} // namespace luci
+
+#endif //__LUCI_FORCE_QUANT_PARAM_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/FuseAddWithFullyConnectedPass.h b/compiler/luci/pass/include/luci/Pass/FuseAddWithFullyConnectedPass.h
new file mode 100644
index 000000000..a59b644e9
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/FuseAddWithFullyConnectedPass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FUSE_ADD_WITH_FULLY_CONNECTED_PASS_H__
+#define __LUCI_FUSE_ADD_WITH_FULLY_CONNECTED_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to fuse Add into FullyConnected
+ */
+struct FuseAddWithFullyConnectedPass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::FuseAddWithFullyConnectedPass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FUSE_ADD_WITH_FULLY_CONNECTED_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/QuantizeDequantizeWeightsPass.h b/compiler/luci/pass/include/luci/Pass/QuantizeDequantizeWeightsPass.h
index 78e7323f9..68765ec5b 100644
--- a/compiler/luci/pass/include/luci/Pass/QuantizeDequantizeWeightsPass.h
+++ b/compiler/luci/pass/include/luci/Pass/QuantizeDequantizeWeightsPass.h
@@ -32,9 +32,10 @@ namespace luci
 class QuantizeDequantizeWeightsPass : public logo::Pass
 {
 public:
-  QuantizeDequantizeWeightsPass(loco::DataType input_dtype, loco::DataType output_dtype,
+  QuantizeDequantizeWeightsPass(loco::DataType input_model_dtype, loco::DataType output_model_dtype,
                                 QuantizationGranularity granularity)
-    : _input_dtype{input_dtype}, _output_dtype{output_dtype}, _granularity{granularity}
+    : _input_model_dtype{input_model_dtype}, _output_model_dtype{output_model_dtype}, _granularity{
+                                                                                        granularity}
   {
     // DO NOTHING
   }
@@ -44,8 +45,8 @@ public:
   bool run(loco::Graph *graph);
 
 private:
-  loco::DataType _input_dtype;
-  loco::DataType _output_dtype;
+  loco::DataType _input_model_dtype;
+  loco::DataType _output_model_dtype;
   QuantizationGranularity _granularity;
 };
 
diff --git a/compiler/luci/pass/include/luci/Pass/QuantizeWithMinMaxPass.h b/compiler/luci/pass/include/luci/Pass/QuantizeWithMinMaxPass.h
index 9520910d5..d618a07b6 100644
--- a/compiler/luci/pass/include/luci/Pass/QuantizeWithMinMaxPass.h
+++ b/compiler/luci/pass/include/luci/Pass/QuantizeWithMinMaxPass.h
@@ -32,9 +32,10 @@ namespace luci
 class QuantizeWithMinMaxPass : public logo::Pass
 {
 public:
-  QuantizeWithMinMaxPass(loco::DataType input_dtype, loco::DataType output_dtype,
+  QuantizeWithMinMaxPass(loco::DataType input_model_dtype, loco::DataType output_model_dtype,
                          QuantizationGranularity granularity)
-    : _input_dtype{input_dtype}, _output_dtype{output_dtype}, _granularity{granularity}
+    : _input_model_dtype{input_model_dtype}, _output_model_dtype{output_model_dtype}, _granularity{
+                                                                                        granularity}
   {
     // DO NOTHING
   }
@@ -44,8 +45,8 @@ public:
   bool run(loco::Graph *graph);
 
 private:
-  loco::DataType _input_dtype;
-  loco::DataType _output_dtype;
+  loco::DataType _input_model_dtype;
+  loco::DataType _output_model_dtype;
   QuantizationGranularity _granularity;
 };
 
diff --git a/compiler/luci/pass/include/luci/Pass/SubstituteSplitVToSplitPass.h b/compiler/luci/pass/include/luci/Pass/SubstituteSplitVToSplitPass.h
new file mode 100644
index 000000000..8c8900159
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/SubstituteSplitVToSplitPass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_SUBSTITUTE_SPLIT_V_TO_SPLIT_PASS_H__
+#define __LUCI_SUBSTITUTE_SPLIT_V_TO_SPLIT_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to substitute certain SplitV to Split.
+ */
+struct SubstituteSplitVToSplitPass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::SubstituteSplitVToSplitPass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_SUBSTITUTE_SPLIT_V_TO_SPLIT_PASS_H__
diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp
index 98c22a07a..5d0c92625 100644
--- a/compiler/luci/pass/src/CircleOptimizer.cpp
+++ b/compiler/luci/pass/src/CircleOptimizer.cpp
@@ -17,12 +17,16 @@
 #include "luci/CircleOptimizer.h"
 
 #include "luci/Pass/ConvertNCHWToNHWCPass.h"
+#include "luci/Pass/ExpandBroadcastConstPass.h"
 #include "luci/Pass/FoldAddV2Pass.h"
 #include "luci/Pass/FoldCastPass.h"
+#include "luci/Pass/FoldDepthwiseConv2DPass.h"
 #include "luci/Pass/FoldDequantizePass.h"
 #include "luci/Pass/FoldSparseToDensePass.h"
 #include "luci/Pass/ForwardReshapeToUnaryOpPass.h"
+#include "luci/Pass/ForceQuantParamPass.h"
 #include "luci/Pass/FuseActivationFunctionPass.h"
+#include "luci/Pass/FuseAddWithFullyConnectedPass.h"
 #include "luci/Pass/FuseAddWithTConvPass.h"
 #include "luci/Pass/FuseBatchNormWithConvPass.h"
 #include "luci/Pass/FuseBatchNormWithDwConvPass.h"
@@ -55,6 +59,7 @@
 #include "luci/Pass/ShuffleWeightTo16x1Float32Pass.h"
 #include "luci/Pass/SubstitutePackToReshapePass.h"
 #include "luci/Pass/SubstitutePadV2ToPadPass.h"
+#include "luci/Pass/SubstituteSplitVToSplitPass.h"
 #include "luci/Pass/SubstituteSqueezeToReshapePass.h"
 #include "luci/Pass/SubstituteStridedSliceToReshapePass.h"
 #include "luci/Pass/SubstituteTransposeToReshapePass.h"
@@ -86,17 +91,37 @@ namespace
 
 using namespace luci;
 
+template <typename T> T lexical_cast(const std::string &str)
+{
+  std::istringstream ss;
+  ss.str(str);
+  T data;
+  ss >> data;
+  return data;
+}
+
+template <typename T> std::vector<T> lexical_cast(std::vector<std::string> &sv)
+{
+  std::vector<T> result;
+  std::transform(sv.begin(), sv.end(), std::back_inserter(result),
+                 [](std::string str) -> T { return lexical_cast<T>(str); });
+  return result;
+}
+
 class OptimizeOptionsImpl final : public luci::CircleOptimizer::Options
 {
 public:
   void enable(Algorithm) final;
   void param(AlgorithmParameters, const std::string &) final;
   const std::string param(AlgorithmParameters) const final;
+  void params(AlgorithmParameters, std::vector<std::string> &) final;
+  std::vector<std::string> params(AlgorithmParameters) const final;
   bool query(Algorithm) final;
 
 private:
   std::vector<Algorithm> _algorithms;
   std::map<AlgorithmParameters, const std::string> _algorithm_params;
+  std::map<AlgorithmParameters, std::vector<std::string>> _multiple_params;
 };
 
 void OptimizeOptionsImpl::enable(Algorithm algo) { _algorithms.push_back(algo); }
@@ -119,6 +144,24 @@ const std::string OptimizeOptionsImpl::param(AlgorithmParameters param) const
   }
 }
 
+void OptimizeOptionsImpl::params(AlgorithmParameters param, std::vector<std::string> &vec)
+{
+  _multiple_params[param] = vec;
+}
+
+std::vector<std::string> OptimizeOptionsImpl::params(AlgorithmParameters param) const
+{
+  auto param_vec = _multiple_params.find(param);
+  if (param_vec != _multiple_params.end())
+  {
+    return param_vec->second;
+  }
+  else
+  {
+    return std::vector<std::string>();
+  }
+}
+
 bool OptimizeOptionsImpl::query(Algorithm algo)
 {
   std::vector<Algorithm>::iterator it = std::find(_algorithms.begin(), _algorithms.end(), algo);
@@ -237,6 +280,10 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   {
     phase.emplace_back(std::make_unique<FuseBatchNormWithTConvPass>());
   }
+  if (_options->query(Options::Algorithm::FuseAddWithFullyConnected))
+  {
+    phase.emplace_back(std::make_unique<FuseAddWithFullyConnectedPass>());
+  }
   if (_options->query(Options::Algorithm::FuseAddWithTConv))
   {
     phase.emplace_back(std::make_unique<FuseAddWithTConvPass>());
@@ -257,6 +304,10 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   {
     phase.emplace_back(std::make_unique<luci::FoldCastPass>());
   }
+  if (_options->query(Options::Algorithm::FoldDepthwiseConv2D))
+  {
+    phase.emplace_back(std::make_unique<luci::FoldDepthwiseConv2DPass>());
+  }
   if (_options->query(Options::Algorithm::FoldDequantize))
   {
     phase.emplace_back(std::make_unique<luci::FoldDequantizePass>());
@@ -281,6 +332,10 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   {
     phase.emplace_back(std::make_unique<luci::ShuffleWeightTo16x1Float32Pass>());
   }
+  if (_options->query(Options::Algorithm::ExpandBroadcastConst))
+  {
+    phase.emplace_back(std::make_unique<luci::ExpandBroadcastConstPass>());
+  }
   if (_options->query(Options::Algorithm::RemoveFakeQuant))
   {
     phase.emplace_back(std::make_unique<luci::RemoveFakeQuantPass>());
@@ -329,6 +384,10 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   {
     phase.emplace_back(std::make_unique<luci::SubstitutePadV2ToPadPass>());
   }
+  if (_options->query(Options::Algorithm::SubstituteSplitVToSplit))
+  {
+    phase.emplace_back(std::make_unique<luci::SubstituteSplitVToSplitPass>());
+  }
   if (_options->query(Options::Algorithm::SubstituteSqueezeToReshape))
   {
     phase.emplace_back(std::make_unique<luci::SubstituteSqueezeToReshapePass>());
@@ -363,28 +422,30 @@ void CircleOptimizer::quantize(loco::Graph *g) const
   // Fake quantization of weights
   if (_options->query(Options::Algorithm::QuantizeDequantizeWeights))
   {
-    static const std::vector<std::string> fakeq_supported_input_dtype{"float32"};
-    static const std::vector<std::string> fakeq_supported_output_dtype{"uint8", "int16"};
+    static const std::vector<std::string> fakeq_supported_input_model_dtype{"float32"};
+    static const std::vector<std::string> fakeq_supported_output_model_dtype{"uint8", "int16"};
     static const std::vector<std::string> fakeq_supported_granularity{"layer", "channel"};
 
-    auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype);
-    auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype);
+    auto input_model_dtype =
+      _options->param(Options::AlgorithmParameters::Quantize_input_model_dtype);
+    auto output_model_dtype =
+      _options->param(Options::AlgorithmParameters::Quantize_output_model_dtype);
     auto granularity = _options->param(Options::AlgorithmParameters::Quantize_granularity);
 
-    if (!in_array(to_lower_case(input_dtype), fakeq_supported_input_dtype))
+    if (!in_array(to_lower_case(input_model_dtype), fakeq_supported_input_model_dtype))
       throw std::runtime_error("Unsupported input type. List of supported input type: " +
-                               to_string(fakeq_supported_input_dtype));
+                               to_string(fakeq_supported_input_model_dtype));
 
-    if (!in_array(to_lower_case(output_dtype), fakeq_supported_output_dtype))
+    if (!in_array(to_lower_case(output_model_dtype), fakeq_supported_output_model_dtype))
       throw std::runtime_error("Unsupported output type. List of supported output type: " +
-                               to_string(fakeq_supported_output_dtype));
+                               to_string(fakeq_supported_output_model_dtype));
 
     if (!in_array(to_lower_case(granularity), fakeq_supported_granularity))
       throw std::runtime_error("Unsupported granularity. List of supported granularity: " +
                                to_string(fakeq_supported_granularity));
 
     if (str_to_granularity(granularity) == QuantizationGranularity::LayerWise &&
-        str_to_dtype(output_dtype) != loco::DataType::U8)
+        str_to_dtype(output_model_dtype) != loco::DataType::U8)
       throw std::runtime_error("Layer-wise quantization only supports uint8 dtype.");
 
     // Clear existing quantparams before doing fake quantization
@@ -395,39 +456,43 @@ void CircleOptimizer::quantize(loco::Graph *g) const
         circle_node->quantparam(nullptr);
     }
 
-    luci::QuantizeDequantizeWeightsPass fake_quantizer(
-      str_to_dtype(input_dtype), str_to_dtype(output_dtype), str_to_granularity(granularity));
+    luci::QuantizeDequantizeWeightsPass fake_quantizer(str_to_dtype(input_model_dtype),
+                                                       str_to_dtype(output_model_dtype),
+                                                       str_to_granularity(granularity));
     fake_quantizer.run(g);
   }
 
   // Actual quantization of weights, bias, and activation
   if (_options->query(Options::Algorithm::QuantizeWithMinMax))
   {
-    static const std::vector<std::string> qwmm_supported_input_dtype{"float32"};
-    static const std::vector<std::string> qwmm_supported_output_dtype{"uint8", "int16"};
+    static const std::vector<std::string> qwmm_supported_input_model_dtype{"float32"};
+    static const std::vector<std::string> qwmm_supported_output_model_dtype{"uint8", "int16"};
     static const std::vector<std::string> qwmm_supported_granularity{"layer", "channel"};
 
-    auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype);
-    auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype);
+    auto input_model_dtype =
+      _options->param(Options::AlgorithmParameters::Quantize_input_model_dtype);
+    auto output_model_dtype =
+      _options->param(Options::AlgorithmParameters::Quantize_output_model_dtype);
     auto granularity = _options->param(Options::AlgorithmParameters::Quantize_granularity);
 
-    if (!in_array(to_lower_case(input_dtype), qwmm_supported_input_dtype))
+    if (!in_array(to_lower_case(input_model_dtype), qwmm_supported_input_model_dtype))
       throw std::runtime_error("Unsupported input type. List of supported input types: " +
-                               to_string(qwmm_supported_input_dtype));
+                               to_string(qwmm_supported_input_model_dtype));
 
-    if (!in_array(to_lower_case(output_dtype), qwmm_supported_output_dtype))
+    if (!in_array(to_lower_case(output_model_dtype), qwmm_supported_output_model_dtype))
       throw std::runtime_error("Unsupported output type. List of supported output types: " +
-                               to_string(qwmm_supported_output_dtype));
+                               to_string(qwmm_supported_output_model_dtype));
 
     if (!in_array(to_lower_case(granularity), qwmm_supported_granularity))
       throw std::runtime_error("Unsupported granularity. List of supported granularity: " +
                                to_string(qwmm_supported_granularity));
 
     if (str_to_granularity(granularity) == QuantizationGranularity::LayerWise &&
-        str_to_dtype(output_dtype) != loco::DataType::U8)
+        str_to_dtype(output_model_dtype) != loco::DataType::U8)
       throw std::runtime_error("Layer-wise quantization only supports uint8 dtype.");
 
-    luci::QuantizeWithMinMaxPass quantizer(str_to_dtype(input_dtype), str_to_dtype(output_dtype),
+    luci::QuantizeWithMinMaxPass quantizer(str_to_dtype(input_model_dtype),
+                                           str_to_dtype(output_model_dtype),
                                            str_to_granularity(granularity));
     quantizer.run(g);
 
@@ -446,7 +511,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const
     phase_runner.run(phase);
 
     // Verify the type/granularity of the quantized model
-    luci::QuantizedModelVerifier verifier(str_to_dtype(output_dtype),
+    luci::QuantizedModelVerifier verifier(str_to_dtype(output_model_dtype),
                                           str_to_granularity(granularity));
     verifier.verify(g);
   }
@@ -454,24 +519,44 @@ void CircleOptimizer::quantize(loco::Graph *g) const
   // Requantize
   if (_options->query(Options::Algorithm::Requantize))
   {
-    static const std::vector<std::string> rq_supported_input_dtype{"int8"};
-    static const std::vector<std::string> rq_supported_output_dtype{"uint8"};
+    static const std::vector<std::string> rq_supported_input_model_dtype{"int8"};
+    static const std::vector<std::string> rq_supported_output_model_dtype{"uint8"};
 
-    auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype);
-    auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype);
+    auto input_model_dtype =
+      _options->param(Options::AlgorithmParameters::Quantize_input_model_dtype);
+    auto output_model_dtype =
+      _options->param(Options::AlgorithmParameters::Quantize_output_model_dtype);
 
-    if (!in_array(to_lower_case(input_dtype), rq_supported_input_dtype))
+    if (!in_array(to_lower_case(input_model_dtype), rq_supported_input_model_dtype))
       throw std::runtime_error("Unsupported input type. List of supported input types: " +
-                               to_string(rq_supported_input_dtype));
+                               to_string(rq_supported_input_model_dtype));
 
-    if (!in_array(to_lower_case(output_dtype), rq_supported_output_dtype))
+    if (!in_array(to_lower_case(output_model_dtype), rq_supported_output_model_dtype))
       throw std::runtime_error("Unsupported output type. List of supported output types: " +
-                               to_string(rq_supported_output_dtype));
+                               to_string(rq_supported_output_model_dtype));
 
-    luci::RequantizePass requantizer(str_to_dtype(input_dtype), str_to_dtype(output_dtype));
+    luci::RequantizePass requantizer(str_to_dtype(input_model_dtype),
+                                     str_to_dtype(output_model_dtype));
     requantizer.run(g);
   }
 
+  // Force to write quantparam to specified tensors
+  // NOTE Only per-tensor (not per-channel) qparam can be written
+  if (_options->query(Options::Algorithm::ForceQuantParam))
+  {
+    ForceQuantParamPass::TensorVector tensors =
+      _options->params(Options::AlgorithmParameters::Quantize_tensor_names);
+    auto str_scales = _options->params(Options::AlgorithmParameters::Quantize_scales);
+    auto str_zero_points = _options->params(Options::AlgorithmParameters::Quantize_zero_points);
+
+    // Cast scales/zero_points to proper types
+    ForceQuantParamPass::ScaleVector scales = lexical_cast<float>(str_scales);
+    ForceQuantParamPass::ZPVector zero_points = lexical_cast<int64_t>(str_zero_points);
+
+    ForceQuantParamPass fq(tensors, scales, zero_points);
+    fq.run(g);
+  }
+
   logo::Phase phase;
 
   // Do Shape/Type inference
diff --git a/compiler/luci/pass/src/CircleOptimizer.test.cpp b/compiler/luci/pass/src/CircleOptimizer.test.cpp
index 43d96feaf..a1b5c7f80 100644
--- a/compiler/luci/pass/src/CircleOptimizer.test.cpp
+++ b/compiler/luci/pass/src/CircleOptimizer.test.cpp
@@ -33,6 +33,7 @@ TEST(CircleOptimizerTest, optimize_algorithms)
   // TODO add more if needed
   options->enable(Algorithms::FoldAddV2);
   options->enable(Algorithms::FoldCast);
+  options->enable(Algorithms::FoldDepthwiseConv2D);
   options->enable(Algorithms::FoldDequantize);
   options->enable(Algorithms::FoldSparseToDense);
   options->enable(Algorithms::FusePreActivationBatchNorm);
@@ -45,6 +46,7 @@ TEST(CircleOptimizerTest, optimize_algorithms)
   options->enable(Algorithms::SubstituteStridedSliceToReshape);
   options->enable(Algorithms::SubstituteTransposeToReshape);
   options->enable(Algorithms::ConvertNCHWToNHWC);
+  options->enable(Algorithms::ExpandBroadcastConst);
 
   o.optimize(&g);
 
@@ -78,8 +80,8 @@ TEST(CircleOptimizerTest, quantize_quantdequant_simple)
   auto options = o.options();
 
   options->enable(Algorithms::QuantizeDequantizeWeights);
-  options->param(AlgorithmParameters::Quantize_input_dtype, "float32");
-  options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+  options->param(AlgorithmParameters::Quantize_input_model_dtype, "float32");
+  options->param(AlgorithmParameters::Quantize_output_model_dtype, "uint8");
   options->param(AlgorithmParameters::Quantize_granularity, "layer");
 
   o.quantize(&g);
@@ -95,8 +97,8 @@ TEST(CircleOptimizerTest, quantize_quantdequant_input_NEG)
   auto options = o.options();
 
   options->enable(Algorithms::QuantizeDequantizeWeights);
-  options->param(AlgorithmParameters::Quantize_input_dtype, "invalid");
-  options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+  options->param(AlgorithmParameters::Quantize_input_model_dtype, "invalid");
+  options->param(AlgorithmParameters::Quantize_output_model_dtype, "uint8");
   options->param(AlgorithmParameters::Quantize_granularity, "layer");
 
   EXPECT_THROW(o.quantize(&g), std::runtime_error);
@@ -110,8 +112,8 @@ TEST(CircleOptimizerTest, quantize_quantdequant_output_NEG)
   auto options = o.options();
 
   options->enable(Algorithms::QuantizeDequantizeWeights);
-  options->param(AlgorithmParameters::Quantize_input_dtype, "float32");
-  options->param(AlgorithmParameters::Quantize_output_dtype, "invalid");
+  options->param(AlgorithmParameters::Quantize_input_model_dtype, "float32");
+  options->param(AlgorithmParameters::Quantize_output_model_dtype, "invalid");
   options->param(AlgorithmParameters::Quantize_granularity, "layer");
 
   EXPECT_THROW(o.quantize(&g), std::runtime_error);
@@ -125,8 +127,8 @@ TEST(CircleOptimizerTest, quantize_quantdequant_gran_NEG)
   auto options = o.options();
 
   options->enable(Algorithms::QuantizeDequantizeWeights);
-  options->param(AlgorithmParameters::Quantize_input_dtype, "float32");
-  options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+  options->param(AlgorithmParameters::Quantize_input_model_dtype, "float32");
+  options->param(AlgorithmParameters::Quantize_output_model_dtype, "uint8");
   options->param(AlgorithmParameters::Quantize_granularity, "invalid");
 
   EXPECT_THROW(o.quantize(&g), std::runtime_error);
@@ -140,8 +142,8 @@ TEST(CircleOptimizerTest, quantize_minmax_simple)
   auto options = o.options();
 
   options->enable(Algorithms::QuantizeWithMinMax);
-  options->param(AlgorithmParameters::Quantize_input_dtype, "float32");
-  options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+  options->param(AlgorithmParameters::Quantize_input_model_dtype, "float32");
+  options->param(AlgorithmParameters::Quantize_output_model_dtype, "uint8");
   options->param(AlgorithmParameters::Quantize_granularity, "layer");
 
   o.quantize(&g);
@@ -157,8 +159,8 @@ TEST(CircleOptimizerTest, quantize_minmax_input_NEG)
   auto options = o.options();
 
   options->enable(Algorithms::QuantizeWithMinMax);
-  options->param(AlgorithmParameters::Quantize_input_dtype, "invalid");
-  options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+  options->param(AlgorithmParameters::Quantize_input_model_dtype, "invalid");
+  options->param(AlgorithmParameters::Quantize_output_model_dtype, "uint8");
   options->param(AlgorithmParameters::Quantize_granularity, "layer");
 
   EXPECT_THROW(o.quantize(&g), std::runtime_error);
@@ -172,8 +174,8 @@ TEST(CircleOptimizerTest, quantize_minmax_output_NEG)
   auto options = o.options();
 
   options->enable(Algorithms::QuantizeWithMinMax);
-  options->param(AlgorithmParameters::Quantize_input_dtype, "float32");
-  options->param(AlgorithmParameters::Quantize_output_dtype, "invalid");
+  options->param(AlgorithmParameters::Quantize_input_model_dtype, "float32");
+  options->param(AlgorithmParameters::Quantize_output_model_dtype, "invalid");
   options->param(AlgorithmParameters::Quantize_granularity, "layer");
 
   EXPECT_THROW(o.quantize(&g), std::runtime_error);
@@ -187,8 +189,8 @@ TEST(CircleOptimizerTest, quantize_minmax_gran_NEG)
   auto options = o.options();
 
   options->enable(Algorithms::QuantizeWithMinMax);
-  options->param(AlgorithmParameters::Quantize_input_dtype, "float32");
-  options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+  options->param(AlgorithmParameters::Quantize_input_model_dtype, "float32");
+  options->param(AlgorithmParameters::Quantize_output_model_dtype, "uint8");
   options->param(AlgorithmParameters::Quantize_granularity, "invalid");
 
   EXPECT_THROW(o.quantize(&g), std::runtime_error);
@@ -202,8 +204,8 @@ TEST(CircleOptimizerTest, quantize_requant_simple)
   auto options = o.options();
 
   options->enable(Algorithms::Requantize);
-  options->param(AlgorithmParameters::Quantize_input_dtype, "int8");
-  options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+  options->param(AlgorithmParameters::Quantize_input_model_dtype, "int8");
+  options->param(AlgorithmParameters::Quantize_output_model_dtype, "uint8");
 
   o.quantize(&g);
 
@@ -218,8 +220,8 @@ TEST(CircleOptimizerTest, quantize_requant_input_NEG)
   auto options = o.options();
 
   options->enable(Algorithms::Requantize);
-  options->param(AlgorithmParameters::Quantize_input_dtype, "invalid");
-  options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+  options->param(AlgorithmParameters::Quantize_input_model_dtype, "invalid");
+  options->param(AlgorithmParameters::Quantize_output_model_dtype, "uint8");
 
   EXPECT_THROW(o.quantize(&g), std::runtime_error);
 }
@@ -232,8 +234,8 @@ TEST(CircleOptimizerTest, quantize_requant_output_NEG)
   auto options = o.options();
 
   options->enable(Algorithms::Requantize);
-  options->param(AlgorithmParameters::Quantize_input_dtype, "int8");
-  options->param(AlgorithmParameters::Quantize_output_dtype, "invalid");
+  options->param(AlgorithmParameters::Quantize_input_model_dtype, "int8");
+  options->param(AlgorithmParameters::Quantize_output_model_dtype, "invalid");
 
   EXPECT_THROW(o.quantize(&g), std::runtime_error);
 }
diff --git a/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp
index 95e23e1b8..270714049 100644
--- a/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp
+++ b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp
@@ -28,6 +28,22 @@
 namespace
 {
 
+bool is_same_shape(const luci::CircleNode *node, const std::vector<loco::Dimension> &shape)
+{
+  if (not node)
+    return false;
+
+  if (shape.size() != node->rank())
+    return false;
+
+  for (uint32_t i = 0; i < shape.size(); i++)
+  {
+    if (not(node->dim(i) == shape[i]))
+      return false;
+  }
+  return true;
+}
+
 enum class DataFormat
 {
   NCHW,
@@ -465,7 +481,7 @@ bool is_NCHW_with_s_const(const T *node, luci::CircleNode *&pred_node,
 //
 // Find MUL with an NCHW pattern described below
 //   - Input (non-constant) shape : [N, C, H, W]
-//   - Input (constant) shape : [1, C, 1, 1] or a scalar (1)
+//   - Input (constant) shape : [1, C, 1, 1], [N, C, H, W] or a scalar (1)
 //   - Output shape : [N, C, H, W]
 bool is_NCHW_with_const(const luci::CircleMul *node, luci::CircleNode *&pred_node,
                         luci::CircleConst *&multiplier)
@@ -497,26 +513,22 @@ bool is_NCHW_with_const(const luci::CircleMul *node, luci::CircleNode *&pred_nod
   if (const_rank != 4 && const_rank != 0 && const_rank != 1)
     return false;
 
-  if (const_rank == 4)
-  {
-    for (uint32_t i = 0; i < const_rank; i++)
-    {
-      if (i != 1 && multiplier->dim(i).value() != 1)
-        return false;
-    }
-  }
-
   const auto input_cdim = pred_node->dim(1);
   const auto output_cdim = node->dim(1);
 
   if (const_rank == 4)
   {
-    const auto const_cdim = multiplier->dim(1);
-    // Check Input, Output, Const have the same channel size
-    if (const_cdim == input_cdim && input_cdim == output_cdim)
-      return true;
-    else
-      return false;
+    bool supported_shape = false;
+
+    // Check multiplier is (1, C, 1, 1)
+    if (is_same_shape(multiplier, {1, node->dim(1), 1, 1}))
+      supported_shape = true;
+
+    // Check multiplier is (N, C, H, W)
+    if (is_same_shape(multiplier, {node->dim(0), node->dim(1), node->dim(2), node->dim(3)}))
+      supported_shape = true;
+
+    return supported_shape;
   }
   if (input_cdim == output_cdim)
     return true;
@@ -527,7 +539,7 @@ bool is_NCHW_with_const(const luci::CircleMul *node, luci::CircleNode *&pred_nod
 // We assume ADD with const input is NCHW if,
 // Input shape: (N, C, H, W)
 // Output shape: (N, C, H, W)
-// 1. Const shape is (1, C, 1, 1) or a scalar (1)
+// 1. Const shape is (1, C, 1, 1), (N, C, H, W) or a scalar (1)
 // 2. Input, Output, Const have the same C.
 bool is_NCHW_with_const(const luci::CircleAdd *node, luci::CircleNode *&pred_node,
                         luci::CircleConst *&beta)
@@ -559,30 +571,22 @@ bool is_NCHW_with_const(const luci::CircleAdd *node, luci::CircleNode *&pred_nod
   if (const_rank != 4 && const_rank != 0 && const_rank != 1)
     return false;
 
-  if (const_rank == 4)
-  {
-    // Check the shape is (1, C, 1, 1)
-    for (uint32_t i = 0; i < const_rank; i++)
-    {
-      if (i == 1)
-        continue;
-
-      if (beta->dim(i).value() != 1)
-        return false;
-    }
-  }
-
   const auto input_cdim = pred_node->dim(1);
   const auto output_cdim = node->dim(1);
 
   if (const_rank == 4)
   {
-    const auto const_cdim = beta->dim(1);
-    // Check Input, Output, Const have the same channel size
-    if (const_cdim == input_cdim && input_cdim == output_cdim)
-      return true;
-    else
-      return false;
+    bool supported_shape = false;
+
+    // Check beta is (1, C, 1, 1)
+    if (is_same_shape(beta, {1, node->dim(1), 1, 1}))
+      supported_shape = true;
+
+    // Check beta is (N, C, H, W)
+    if (is_same_shape(beta, {node->dim(0), node->dim(1), node->dim(2), node->dim(3)}))
+      supported_shape = true;
+
+    return supported_shape;
   }
   if (input_cdim == output_cdim)
     return true;
@@ -593,7 +597,7 @@ bool is_NCHW_with_const(const luci::CircleAdd *node, luci::CircleNode *&pred_nod
 // We assume SUB with const input is NCHW if,
 // Input shape: (N, C, H, W)
 // Output shape: (N, C, H, W)
-// 1. Const shape is (1, C, 1, 1) or a scalar (1)
+// 1. Const shape is (1, C, 1, 1), (N, C, H, W) or a scalar (1)
 // 2. Input, Output, Const have the same C.
 bool is_NCHW_with_const(const luci::CircleSub *node, const luci::CircleNode *pred_node,
                         const luci::CircleConst *subtract)
@@ -609,30 +613,22 @@ bool is_NCHW_with_const(const luci::CircleSub *node, const luci::CircleNode *pre
   if (const_rank != 4 && const_rank != 0 && const_rank != 1)
     return false;
 
-  if (const_rank == 4)
-  {
-    // Check the shape is (1, C, 1, 1)
-    for (uint32_t i = 0; i < const_rank; i++)
-    {
-      if (i == 1)
-        continue;
-
-      if (subtract->dim(i).value() != 1)
-        return false;
-    }
-  }
-
   const auto input_cdim = pred_node->dim(1);
   const auto output_cdim = node->dim(1);
 
   if (const_rank == 4)
   {
-    const auto const_cdim = subtract->dim(1);
-    // Check Input, Output, Const have the same channel size
-    if (const_cdim == input_cdim && input_cdim == output_cdim)
-      return true;
-    else
-      return false;
+    bool supported_shape = false;
+
+    // Check subtract is (1, C, 1, 1)
+    if (is_same_shape(subtract, {1, node->dim(1), 1, 1}))
+      supported_shape = true;
+
+    // Check subtract is (N, C, H, W)
+    if (is_same_shape(subtract, {node->dim(0), node->dim(1), node->dim(2), node->dim(3)}))
+      supported_shape = true;
+
+    return supported_shape;
   }
   if (input_cdim == output_cdim)
     return true;
diff --git a/compiler/luci/pass/src/ConvertNCHWToNHWCPass.test.cpp b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.test.cpp
index d844246f8..c9412fbb1 100644
--- a/compiler/luci/pass/src/ConvertNCHWToNHWCPass.test.cpp
+++ b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.test.cpp
@@ -130,6 +130,19 @@ protected:
   }
 
 public:
+  void update_const_shape_to_nchw(void)
+  {
+    uint32_t channel_size = 16;
+    beta->shape({1, channel_size, 4, 4});
+
+    beta->size<loco::DataType::FLOAT32>(channel_size * 4 * 4);
+    for (uint32_t i = 0; i < channel_size; i++)
+    {
+      beta->at<loco::DataType::FLOAT32>(i) = i;
+    }
+  }
+
+public:
   luci::CircleAdd *add = nullptr;
   luci::CircleConst *beta = nullptr;
 };
@@ -421,6 +434,19 @@ protected:
   }
 
 public:
+  void update_const_shape_to_nchw(void)
+  {
+    uint32_t channel_size = 16;
+    multiplier->shape({1, channel_size, 4, 4});
+
+    multiplier->size<loco::DataType::FLOAT32>(channel_size * 4 * 4);
+    for (uint32_t i = 0; i < channel_size; i++)
+    {
+      multiplier->at<loco::DataType::FLOAT32>(i) = i;
+    }
+  }
+
+public:
   luci::CircleMul *mul = nullptr;
   luci::CircleConst *multiplier = nullptr;
 };
@@ -696,6 +722,19 @@ protected:
   }
 
 public:
+  void update_const_shape_to_nchw(void)
+  {
+    uint32_t channel_size = 16;
+    beta->shape({1, channel_size, 4, 4});
+
+    beta->size<loco::DataType::FLOAT32>(channel_size * 4 * 4);
+    for (uint32_t i = 0; i < channel_size; i++)
+    {
+      beta->at<loco::DataType::FLOAT32>(i) = i;
+    }
+  }
+
+public:
   luci::CircleSub *sub = nullptr;
   luci::CircleConst *beta = nullptr;
 };
@@ -815,6 +854,30 @@ TEST(ConvertNCHWToNHWC, Add)
   check_pre_trans(g.output->from());
 }
 
+TEST(ConvertNCHWToNHWC, Add_NCHW_const)
+{
+  AddGraph g;
+  g.init();
+  g.update_const_shape_to_nchw();
+
+  run_phase(&g.g, false, false);
+
+  check_pre_trans(g.add->x());
+
+  auto add_succs = loco::succs(g.add);
+  EXPECT_EQ(1, add_succs.size());
+  check_post_trans(*add_succs.begin());
+
+  uint32_t channel_size = 16;
+  auto new_beta = dynamic_cast<luci::CircleConst *>(g.add->y());
+  EXPECT_NE(nullptr, new_beta);
+  EXPECT_EQ(4, new_beta->rank());
+  EXPECT_EQ(1, new_beta->dim(0).value());
+  EXPECT_EQ(4, new_beta->dim(1).value());
+  EXPECT_EQ(4, new_beta->dim(2).value());
+  EXPECT_EQ(channel_size, new_beta->dim(3).value());
+}
+
 TEST(ConvertNCHWToNHWC, NHWC_Relu)
 {
   // Relu is already NHWC, so it should not be converted
@@ -1123,6 +1186,30 @@ TEST(ConvertNCHWToNHWC, Mul)
   check_pre_trans(g.output->from());
 }
 
+TEST(ConvertNCHWToNHWC, Mul_NCHW_const)
+{
+  MulGraph g;
+  g.init();
+  g.update_const_shape_to_nchw();
+
+  run_phase(&g.g, false, false);
+
+  check_pre_trans(g.mul->x());
+
+  auto mul_succs = loco::succs(g.mul);
+  EXPECT_EQ(1, mul_succs.size());
+  check_post_trans(*mul_succs.begin());
+
+  uint32_t channel_size = 16;
+  auto new_multiplier = dynamic_cast<luci::CircleConst *>(g.mul->y());
+  EXPECT_NE(nullptr, new_multiplier);
+  EXPECT_EQ(4, new_multiplier->rank());
+  EXPECT_EQ(1, new_multiplier->dim(0).value());
+  EXPECT_EQ(4, new_multiplier->dim(1).value());
+  EXPECT_EQ(4, new_multiplier->dim(2).value());
+  EXPECT_EQ(channel_size, new_multiplier->dim(3).value());
+}
+
 TEST(ConvertNCHWToNHWC, MulScalar)
 {
   MulScalarGraph g;
@@ -1432,6 +1519,30 @@ TEST(ConvertNCHWToNHWC, Sub)
   check_pre_trans(g.output->from());
 }
 
+TEST(ConvertNCHWToNHWC, Sub_NCHW_const)
+{
+  SubGraph g;
+  g.init();
+  g.update_const_shape_to_nchw();
+
+  run_phase(&g.g, false, false);
+
+  check_pre_trans(g.sub->x());
+
+  auto sub_succs = loco::succs(g.sub);
+  EXPECT_EQ(1, sub_succs.size());
+  check_post_trans(*sub_succs.begin());
+
+  uint32_t channel_size = 16;
+  auto new_beta = dynamic_cast<luci::CircleConst *>(g.sub->y());
+  EXPECT_NE(nullptr, new_beta);
+  EXPECT_EQ(4, new_beta->rank());
+  EXPECT_EQ(1, new_beta->dim(0).value());
+  EXPECT_EQ(4, new_beta->dim(1).value());
+  EXPECT_EQ(4, new_beta->dim(2).value());
+  EXPECT_EQ(channel_size, new_beta->dim(3).value());
+}
+
 TEST(ConvertNCHWToNHWC, SubScalar)
 {
   SubScalarGraph g;
diff --git a/compiler/luci/pass/src/ExpandBroadcastConstPass.cpp b/compiler/luci/pass/src/ExpandBroadcastConstPass.cpp
new file mode 100644
index 000000000..25fb9f171
--- /dev/null
+++ b/compiler/luci/pass/src/ExpandBroadcastConstPass.cpp
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/ExpandBroadcastConstPass.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Log.h>
+
+#include <type_traits>
+
+namespace
+{
+
+luci::CircleConst *create_expanded_constant(luci::CircleConst *node, luci::CircleNode *successor)
+{
+  LOGGER(l);
+
+  if (successor->rank() != node->rank())
+    return nullptr;
+
+  std::vector<uint32_t> broadcast_dims;
+  for (uint32_t dim = 0; dim < node->rank(); ++dim)
+  {
+    if (node->dim(dim) == successor->dim(dim))
+      continue;
+
+    if (node->dim(dim) == 1)
+      broadcast_dims.push_back(dim);
+  }
+
+  if (broadcast_dims.size() != 1 || broadcast_dims.back() != node->rank() - 1)
+  {
+    WARN(l) << "NYI: Only depth broadcast removal is supported";
+    return nullptr;
+  }
+
+  auto constant = node->graph()->nodes()->create<luci::CircleConst>();
+  constant->name(node->name());
+  constant->dtype(node->dtype());
+  constant->rank(node->rank());
+  constant->shape_status(luci::ShapeStatus::VALID);
+
+  uint32_t node_size = node->size<loco::DataType::FLOAT32>();
+  uint32_t constant_size = 1;
+  for (uint32_t i = 0; i < successor->rank(); ++i)
+  {
+    constant->dim(i).set(successor->dim(i).value());
+    constant_size *= constant->dim(i).value();
+  }
+  constant->size<loco::DataType::FLOAT32>(constant_size);
+
+  auto const node_data = &node->at<loco::DataType::FLOAT32>(0);
+  auto const constant_data = &constant->at<loco::DataType::FLOAT32>(0);
+
+  auto const successor_depth = successor->dim(successor->rank() - 1).value();
+  for (uint32_t d = 0; d < successor_depth; ++d)
+    std::copy(node_data, node_data + node_size, constant_data + d * node_size);
+
+  return constant;
+}
+
+template <typename N> bool expand_node_input(luci::CircleConst *node, luci::CircleNode *successor)
+{
+  static_assert(std::is_base_of<luci::CircleNode, N>::value,
+                "Successor node should have CircleNode base");
+
+  auto const successor_node = loco::must_cast<N *>(successor);
+  auto const successor_x = loco::must_cast<luci::CircleNode *>(successor_node->x());
+  auto const successor_y = loco::must_cast<luci::CircleNode *>(successor_node->y());
+
+  luci::CircleConst *expanded_const;
+
+  if (node == successor_x)
+  {
+    expanded_const = create_expanded_constant(node, successor_y);
+
+    if (expanded_const == nullptr)
+      return false;
+
+    successor_node->x(expanded_const);
+  }
+  else if (node == successor_y)
+  {
+    expanded_const = create_expanded_constant(node, successor_x);
+
+    if (expanded_const == nullptr)
+      return false;
+
+    successor_node->y(expanded_const);
+  }
+
+  return true;
+}
+
+/**
+ * Expand constants following broadcasting rules for binary input nodes (Add, Mul, etc.)
+ *
+ *    BEFORE
+ *
+ *    [CircleInput] [CircleConst (H x W x 1)]
+ *               |     |
+ *             [CircleAdd]
+ *
+ *    AFTER
+ *
+ *    [CircleInput] [CircleConst (H x W x D)]
+ *               |     |
+ *             [CircleAdd]
+ */
+bool expand_broadcast_const(luci::CircleConst *node)
+{
+  if (node->dtype() != loco::DataType::FLOAT32)
+    return false; // Unsupported data type
+
+  bool changed = false;
+
+  for (auto successor : loco::succs(node))
+  {
+    auto const circle_successor = loco::must_cast<luci::CircleNode *>(successor);
+    switch (circle_successor->opcode())
+    {
+      case luci::CircleOpcode::ADD:
+        if (expand_node_input<luci::CircleAdd>(node, circle_successor))
+          changed = true;
+        break;
+      case luci::CircleOpcode::MUL:
+        if (expand_node_input<luci::CircleMul>(node, circle_successor))
+          changed = true;
+        break;
+      case luci::CircleOpcode::DIV:
+        if (expand_node_input<luci::CircleDiv>(node, circle_successor))
+          changed = true;
+        break;
+      default:
+        break; // Unsupported successor node
+    }
+  }
+
+  return changed;
+}
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ * Broadcast expanding for Const nodes
+ **/
+bool ExpandBroadcastConstPass::run(loco::Graph *g)
+{
+  bool changed = false;
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto const_node = dynamic_cast<luci::CircleConst *>(node);
+    if (const_node == nullptr)
+      continue;
+
+    if (expand_broadcast_const(const_node))
+      changed = true;
+  }
+  return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/ExpandBroadcastConstPass.test.cpp b/compiler/luci/pass/src/ExpandBroadcastConstPass.test.cpp
new file mode 100644
index 000000000..0734e0778
--- /dev/null
+++ b/compiler/luci/pass/src/ExpandBroadcastConstPass.test.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/ExpandBroadcastConstPass.h"
+#include "PassTestGraphs.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+class ExpandBroadcastConstTest : public ::testing::Test
+{
+public:
+  ExpandBroadcastConstTest()
+  {
+    _x = _g.nodes()->create<luci::CircleInput>();
+    _y = _g.nodes()->create<luci::CircleConst>();
+    _add = _g.nodes()->create<luci::CircleAdd>();
+    _output = _g.nodes()->create<luci::CircleOutput>();
+
+    auto graph_input = _g.inputs()->create();
+    graph_input->dtype(loco::DataType::FLOAT32);
+    graph_input->shape({1, H, W, D});
+    _x->index(graph_input->index());
+    _x->dtype(graph_input->dtype());
+    _x->shape({1, H, W, D});
+
+    auto graph_output = _g.outputs()->create();
+    graph_output->dtype(loco::DataType::FLOAT32);
+    graph_output->shape({1, H, W, D});
+    _output->index(graph_output->index());
+    _output->dtype(graph_output->dtype());
+    _output->shape({1, H, W, D});
+
+    _y->dtype(loco::DataType::FLOAT32);
+    _y->shape({1, H, W, 1});
+    _y->size<loco::DataType::FLOAT32>(16);
+
+    _add->dtype(loco::DataType::FLOAT32);
+    _add->fusedActivationFunction(luci::FusedActFunc::NONE);
+    _add->x(_x);
+    _add->y(_y);
+    _add->shape({1, H, W, D});
+
+    _output->from(_add);
+
+    _x->name("input");
+    _output->name("output");
+  }
+
+protected:
+  uint32_t const H = 4;
+  uint32_t const W = 4;
+  uint32_t const D = 3;
+
+protected:
+  loco::Graph _g;
+  luci::CircleAdd *_add = nullptr;
+  luci::CircleInput *_x = nullptr;
+  luci::CircleConst *_y = nullptr;
+  luci::CircleOutput *_output = nullptr;
+};
+
+} // namespace
+
+TEST_F(ExpandBroadcastConstTest, name)
+{
+  luci::ExpandBroadcastConstPass pass;
+  auto const name = pass.name();
+  ASSERT_NE(nullptr, name);
+}
+
+TEST_F(ExpandBroadcastConstTest, remove_broadcast)
+{
+  for (uint32_t i = 0; i < H * W; ++i)
+    _y->at<loco::DataType::FLOAT32>(i) = static_cast<float>(i);
+
+  luci::ExpandBroadcastConstPass pass;
+  ASSERT_TRUE(pass.run(&_g));
+
+  auto broadcasted_const = dynamic_cast<luci::CircleConst *>(_add->y());
+  ASSERT_NE(broadcasted_const, nullptr);
+
+  EXPECT_EQ(broadcasted_const->dtype(), loco::DataType::FLOAT32);
+  EXPECT_EQ(broadcasted_const->dim(1).value(), H);
+  EXPECT_EQ(broadcasted_const->dim(2).value(), W);
+  EXPECT_EQ(broadcasted_const->dim(3).value(), D);
+  EXPECT_EQ(broadcasted_const->size<loco::DataType::FLOAT32>(), H * W * D);
+
+  for (uint32_t i = 0; i < H * W; ++i)
+  {
+    for (uint32_t d = 0; d < D; ++d)
+    {
+      EXPECT_NEAR(broadcasted_const->at<loco::DataType::FLOAT32>(i + H * W * d),
+                  static_cast<float>(i), std::numeric_limits<float>::min());
+    }
+  }
+}
+
+TEST_F(ExpandBroadcastConstTest, remove_broadcast_multiple_successors)
+{
+  auto const circle_sqrt = _g.nodes()->create<luci::CircleSqrt>();
+  circle_sqrt->dtype(loco::DataType::FLOAT32);
+  circle_sqrt->shape({1, H, W, 1});
+  circle_sqrt->x(_y);
+
+  luci::ExpandBroadcastConstPass pass;
+  ASSERT_TRUE(pass.run(&_g));
+
+  auto broadcasted_const = dynamic_cast<luci::CircleConst *>(_add->y());
+  auto original_const = dynamic_cast<luci::CircleConst *>(circle_sqrt->x());
+
+  ASSERT_NE(broadcasted_const, nullptr);
+  EXPECT_EQ(broadcasted_const->dtype(), loco::DataType::FLOAT32);
+  EXPECT_EQ(broadcasted_const->dim(3).value(), D);
+  EXPECT_EQ(broadcasted_const->size<loco::DataType::FLOAT32>(), H * W * D);
+
+  // Check if another successor's node was left intact
+  ASSERT_NE(original_const, nullptr);
+  EXPECT_EQ(original_const->dtype(), loco::DataType::FLOAT32);
+  EXPECT_EQ(original_const->dim(3).value(), 1);
+  EXPECT_EQ(original_const->size<loco::DataType::FLOAT32>(), H * W * 1);
+}
+
+TEST_F(ExpandBroadcastConstTest, broadcast_impossible_NEG)
+{
+  _y->shape({1, H, W, 2});
+  _y->size<loco::DataType::FLOAT32>(H * W * (D - 1));
+
+  luci::ExpandBroadcastConstPass pass;
+  ASSERT_FALSE(pass.run(&_g));
+}
diff --git a/compiler/luci/pass/src/FoldDepthwiseConv2DPass.cpp b/compiler/luci/pass/src/FoldDepthwiseConv2DPass.cpp
new file mode 100644
index 000000000..6e423e3d9
--- /dev/null
+++ b/compiler/luci/pass/src/FoldDepthwiseConv2DPass.cpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FoldDepthwiseConv2DPass.h"
+
+#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h>
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/IR/AttrFusedActFunc.h>
+
+#include <luci/Log.h>
+
+namespace
+{
+
+// TODO Share activation mix/max and compute_input/output code with luci-interpreter
+
+bool compute_output(uint32_t *output_size, luci::Padding padding, int32_t image_size,
+                    int32_t filter_size, int32_t stride, int32_t dilation_rate)
+{
+  auto const effective_filter_size = (filter_size - 1) * dilation_rate + 1;
+  switch (padding)
+  {
+    case luci::Padding::SAME:
+      *output_size = (image_size + stride - 1) / stride;
+      return true;
+
+    case luci::Padding::VALID:
+      *output_size = (image_size + stride - effective_filter_size) / stride;
+      return true;
+
+    default:
+    {
+      LOGGER(l);
+      WARN(l) << "Unsupported padding: " << uint32_t(padding);
+      return false;
+    }
+  }
+}
+
+uint32_t compute_padding(int32_t stride, int32_t dilation_rate, int32_t in_size,
+                         int32_t filter_size, int32_t out_size)
+{
+  auto const effective_filter_size = (filter_size - 1) * dilation_rate + 1;
+  auto const padding = ((out_size - 1) * stride + effective_filter_size - in_size) / 2;
+  return padding > 0 ? padding : 0;
+}
+
+bool set_kernel_parameters(tflite::DepthwiseParams *params, luci::CircleDepthwiseConv2D *node,
+                           uint32_t padding_height, uint32_t padding_width)
+{
+  switch (node->fusedActivationFunction())
+  {
+    case luci::FusedActFunc::NONE:
+    case luci::FusedActFunc::TANH:
+      params->float_activation_min = std::numeric_limits<float>::lowest();
+      params->float_activation_max = std::numeric_limits<float>::max();
+      break;
+    case luci::FusedActFunc::RELU:
+      params->float_activation_min = 0;
+      params->float_activation_max = std::numeric_limits<float>::max();
+      break;
+    case luci::FusedActFunc::RELU_N1_TO_1:
+      params->float_activation_min = -1;
+      params->float_activation_max = 1;
+      break;
+    case luci::FusedActFunc::RELU6:
+      params->float_activation_min = 0;
+      params->float_activation_max = 6;
+      break;
+    default:
+    {
+      LOGGER(l);
+      WARN(l) << "Unsupported activation: " << uint32_t(node->fusedActivationFunction());
+      return false;
+    }
+  }
+
+  params->stride_height = node->stride()->h();
+  params->stride_width = node->stride()->w();
+  params->dilation_height_factor = node->dilation()->h();
+  params->dilation_width_factor = node->dilation()->w();
+  params->depth_multiplier = node->depthMultiplier();
+
+  params->padding_values.height = padding_height;
+  params->padding_values.width = padding_width;
+
+  return true;
+}
+
+/**
+ * Fold DepthwiseConv2D with constant input and filter into a constant tensor
+ *
+ *    BEFORE
+ *
+ *    [CircleConst] [CircleConst]
+ *               |   |
+ *       [CircleDepthwiseConv2D]
+ *
+ *    AFTER
+ *
+ *           [CircleConst]
+ */
+bool fold_depthwise_conv_2d(luci::CircleDepthwiseConv2D *node)
+{
+  LOGGER(l);
+
+  auto const input = dynamic_cast<luci::CircleConst *>(node->input());
+
+  if (input == nullptr)
+    return false; // Constant input is required for folding
+
+  auto const filter = dynamic_cast<luci::CircleConst *>(node->filter());
+
+  if (filter == nullptr)
+    return false; // Constant filter is required for folding
+
+  if (filter->dim(0).value() != 1)
+    return false; // Unsupported batch size
+
+  auto const bias = dynamic_cast<luci::CircleConst *>(node->bias());
+
+  if (bias == nullptr)
+    return false; // Constant bias is required for folding
+
+  auto const input_batches = input->dim(0).value();
+  auto const input_height = input->dim(1).value();
+  auto const input_width = input->dim(2).value();
+  auto const input_depth = input->dim(3).value();
+
+  auto const filter_height = filter->dim(1).value();
+  auto const filter_width = filter->dim(2).value();
+  auto const filter_channels_out = filter->dim(3).value();
+
+  if (filter_channels_out % input_depth != 0)
+    return false; // Wrong input/output depth ratio
+
+  if (node->depthMultiplier() != static_cast<int32_t>(filter_channels_out / input_depth))
+    return false; // Wrong depth multiplier value
+
+  if (bias->rank() != 1 || bias->dim(0).value() != filter_channels_out)
+    return false; // Unsupported bias value
+
+  uint32_t output_height = 0;
+  uint32_t output_width = 0;
+
+  if (!compute_output(&output_height, node->padding(), input_height, filter_height,
+                      node->stride()->h(), node->dilation()->h()))
+    return false; // Unsupported output parameters
+
+  if (!compute_output(&output_width, node->padding(), input_width, filter_width,
+                      node->stride()->w(), node->dilation()->w()))
+    return false; // Unsupported output parameters
+
+  auto const padding_height = compute_padding(node->stride()->h(), node->dilation()->h(),
+                                              input_height, filter_height, output_height);
+  auto const padding_width = compute_padding(node->stride()->w(), node->dilation()->w(),
+                                             input_width, filter_width, output_width);
+
+  tflite::DepthwiseParams params{};
+
+  if (!set_kernel_parameters(&params, node, padding_height, padding_width))
+    return false; // Unsupported kernel parameter values
+
+  auto constant = node->graph()->nodes()->create<luci::CircleConst>();
+  constant->name(node->name());
+  constant->dtype(node->dtype());
+  constant->rank(node->rank());
+  constant->shape_status(luci::ShapeStatus::VALID);
+  for (uint32_t i = 0; i < node->rank(); ++i)
+    constant->dim(i).set(node->dim(i).value());
+
+  constant->size<loco::DataType::FLOAT32>(input_batches * output_height * output_width *
+                                          filter_channels_out);
+
+  auto const input_data = &input->at<loco::DataType::FLOAT32>(0);
+  auto const filter_data = &filter->at<loco::DataType::FLOAT32>(0);
+  auto const bias_data = &bias->at<loco::DataType::FLOAT32>(0);
+  auto const constant_data = &constant->at<loco::DataType::FLOAT32>(0);
+
+  auto tensor_shape = [](luci::CircleNode *node) {
+    tflite::RuntimeShape runtime_shape(node->rank());
+    for (uint32_t i = 0; i < node->rank(); ++i)
+      runtime_shape.SetDim(i, node->dim(i).value());
+    return runtime_shape;
+  };
+
+  tflite::reference_ops::DepthwiseConv(params, tensor_shape(input), input_data,
+                                       tensor_shape(filter), filter_data, tensor_shape(bias),
+                                       bias_data, tensor_shape(constant), constant_data);
+
+  loco::replace(node).with(constant);
+
+  return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ * Constant Folding for DepthwiseConv2D Op
+ **/
+bool FoldDepthwiseConv2DPass::run(loco::Graph *g)
+{
+  bool changed = false;
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto depthwise_conv2d = dynamic_cast<CircleDepthwiseConv2D *>(node);
+
+    if (depthwise_conv2d == nullptr)
+      continue;
+
+    switch (depthwise_conv2d->dtype())
+    {
+      case loco::DataType::FLOAT32:
+        changed = fold_depthwise_conv_2d(depthwise_conv2d);
+        break;
+      default:
+        break;
+    }
+  }
+
+  return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/FoldDepthwiseConv2DPass.test.cpp b/compiler/luci/pass/src/FoldDepthwiseConv2DPass.test.cpp
new file mode 100644
index 000000000..b1ef56833
--- /dev/null
+++ b/compiler/luci/pass/src/FoldDepthwiseConv2DPass.test.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FoldDepthwiseConv2DPass.h"
+#include "PassTestGraphs.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+/**
+ *  Graph has an DepthwiseConv2D Op with constant inputs
+ *
+ *    BEFORE
+ *
+ *    [CircleConst] [CircleConst]
+ *               |   |
+ *       [CircleDepthwiseConv2D]
+ *
+ *    AFTER
+ *
+ *           [CircleConst]
+ */
+class FoldDepthwiseConv2DTest : public luci::ConstantFoldingTestGraph, public ::testing::Test
+{
+public:
+  FoldDepthwiseConv2DTest() : luci::ConstantFoldingTestGraph({1, 4, 4, 1}, loco::DataType::FLOAT32)
+  {
+    _dconv = _g.nodes()->create<luci::CircleDepthwiseConv2D>();
+    _dconv_input = _g.nodes()->create<luci::CircleConst>();
+    _dconv_filter = _g.nodes()->create<luci::CircleConst>();
+    _dconv_bias = _g.nodes()->create<luci::CircleConst>();
+
+    _dconv->dtype(loco::DataType::FLOAT32);
+    _dconv->padding(luci::Padding::VALID);
+    _dconv->fusedActivationFunction(luci::FusedActFunc::NONE);
+    _dconv->input(_dconv_input);
+    _dconv->filter(_dconv_filter);
+    _dconv->bias(_dconv_bias);
+    _dconv->shape({1, 4, 4, 1});
+    _dconv->stride()->h(1);
+    _dconv->stride()->w(1);
+    _dconv->depthMultiplier(1);
+
+    _dconv_input->dtype(loco::DataType::FLOAT32);
+    _dconv_input->shape({1, 4, 4, 1});
+    _dconv_input->size<loco::DataType::FLOAT32>(16);
+
+    _dconv_filter->dtype(loco::DataType::FLOAT32);
+    _dconv_filter->shape({1, 1, 1, 1});
+    _dconv_filter->size<loco::DataType::FLOAT32>(1);
+
+    _dconv_bias->dtype(loco::DataType::FLOAT32);
+    _dconv_bias->shape({1});
+    _dconv_bias->size<loco::DataType::FLOAT32>(1);
+
+    _output->from(_dconv);
+  }
+
+protected:
+  void init() final {}
+
+protected:
+  loco::Node *createFoldedPattern() final { return nullptr; }
+
+protected:
+  luci::CircleConst *getFoldedPattern() final
+  {
+    return loco::must_cast<luci::CircleConst *>(_output->from());
+  }
+
+protected:
+  luci::CircleDepthwiseConv2D *_dconv = nullptr;
+  luci::CircleConst *_dconv_input = nullptr;
+  luci::CircleConst *_dconv_filter = nullptr;
+  luci::CircleConst *_dconv_bias = nullptr;
+};
+
+} // namespace
+
+TEST(FoldDepthwiseConv2DPass, name)
+{
+  luci::FoldDepthwiseConv2DPass pass;
+  auto const name = pass.name();
+  ASSERT_NE(nullptr, name);
+}
+
+TEST_F(FoldDepthwiseConv2DTest, fold_depthwise_conv2d)
+{
+  for (uint32_t i = 0; i < 16; ++i)
+    _dconv_input->at<loco::DataType::FLOAT32>(i) = 0.5;
+  _dconv_filter->at<loco::DataType::FLOAT32>(0) = 0.5;
+
+  luci::FoldDepthwiseConv2DPass pass;
+  ASSERT_TRUE(pass.run(&_g));
+
+  auto folded_const = getFoldedPattern();
+  EXPECT_EQ(folded_const->dtype(), loco::DataType::FLOAT32);
+  EXPECT_NEAR(folded_const->at<loco::DataType::FLOAT32>(0), 0.25,
+              std::numeric_limits<float>::min());
+  EXPECT_NEAR(folded_const->at<loco::DataType::FLOAT32>(15), 0.25,
+              std::numeric_limits<float>::min());
+}
+
+TEST_F(FoldDepthwiseConv2DTest, fold_non_constant_NEG)
+{
+  _dconv->input(_input);
+
+  luci::FoldDepthwiseConv2DPass pass;
+  ASSERT_FALSE(pass.run(&_g));
+}
diff --git a/compiler/luci/pass/src/ForceQuantParamPass.cpp b/compiler/luci/pass/src/ForceQuantParamPass.cpp
new file mode 100644
index 000000000..32d482fc1
--- /dev/null
+++ b/compiler/luci/pass/src/ForceQuantParamPass.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/ForceQuantParamPass.h"
+#include "luci/Profile/CircleNodeID.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Log.h>
+
+namespace luci
+{
+
+namespace
+{
+
+void set_qparam(luci::CircleNode *node, float scale, int64_t zp)
+{
+  assert(node); // FIX_CALLER_UNLESS
+
+  auto quantparam = std::make_unique<CircleQuantParam>();
+  quantparam->scale.push_back(scale);
+  quantparam->zerop.push_back(zp);
+
+  node->quantparam(std::move(quantparam));
+}
+
+} // namespace
+
+bool ForceQuantParamPass::run(loco::Graph *g)
+{
+  LOGGER(l);
+  INFO(l) << "ForceQuantParamPass Start" << std::endl;
+
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto const cnode = loco::must_cast<CircleNode *>(node);
+    auto const name = cnode->name();
+    auto target = std::find(_tensors.begin(), _tensors.end(), name);
+    if (target == _tensors.end())
+      continue;
+
+    auto index = target - _tensors.begin();
+    auto scale = _scales[index];
+    auto zp = _zerops[index];
+    set_qparam(cnode, scale, zp);
+
+    _tensors.erase(_tensors.begin() + index);
+    _scales.erase(_scales.begin() + index);
+    _zerops.erase(_zerops.begin() + index);
+  }
+
+  if (_tensors.size() > 0)
+  {
+    std::string msg;
+    for (auto const &t : _tensors)
+      msg += "Tensor does not exist: " + t + ".\n";
+    msg += "Please check tensor name.\n";
+    throw std::runtime_error(msg);
+  }
+
+  INFO(l) << "ForceQuantParamPass End" << std::endl;
+  return false; // one time run
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/ForceQuantParamPass.test.cpp b/compiler/luci/pass/src/ForceQuantParamPass.test.cpp
new file mode 100644
index 000000000..a9da7c25e
--- /dev/null
+++ b/compiler/luci/pass/src/ForceQuantParamPass.test.cpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/ForceQuantParamPass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using TensorVector = luci::ForceQuantParamPass::TensorVector;
+using ScaleVector = luci::ForceQuantParamPass::ScaleVector;
+using ZPVector = luci::ForceQuantParamPass::ZPVector;
+
+std::unique_ptr<luci::CircleQuantParam> make_qparam(float scale, int64_t zp)
+{
+  auto qparam = std::make_unique<luci::CircleQuantParam>();
+  qparam->scale.push_back(scale);
+  qparam->zerop.push_back(zp);
+
+  return std::move(qparam);
+}
+
+bool check_per_tensor_qparam(luci::CircleNode *node, float scale, int64_t zp)
+{
+  assert(node); // FIX_CALLER_UNLESS
+
+  auto qparam = node->quantparam();
+  if (qparam->scale.size() != 1)
+    return false;
+
+  if (qparam->scale[0] != scale)
+    return false;
+
+  if (qparam->zerop.size() != 1)
+    return false;
+
+  if (qparam->zerop[0] != zp)
+    return false;
+
+  return true;
+}
+
+/**
+ *  Graph with a single input and a single output.
+ *
+ *             [Input]
+ *                |
+ *           (graph body) -> implemented by insertGraphBody()
+ *                |
+ *             [Output]
+ *
+ */
+class SISOGraph
+{
+public:
+  SISOGraph() = default;
+
+public:
+  void init()
+  {
+    input = g.nodes()->create<luci::CircleInput>();
+    output = g.nodes()->create<luci::CircleOutput>();
+    input->name("input");
+    output->name("output");
+
+    auto graph_input = g.inputs()->create();
+    input->index(graph_input->index());
+    auto graph_output = g.outputs()->create();
+    output->index(graph_output->index());
+
+    graph_input->dtype(loco::DataType::U8);
+    input->dtype(loco::DataType::U8);
+    output->dtype(loco::DataType::U8);
+    graph_output->dtype(loco::DataType::U8);
+
+    input->quantparam(make_qparam(0.1, 11));
+    output->quantparam(make_qparam(0.2, 12));
+
+    uint32_t channel_size = 16;
+    graph_input->shape({1, channel_size, 4, 4});
+    input->shape({1, channel_size, 4, 4});
+    output->shape({1, channel_size, 4, 4});
+    graph_output->shape({1, channel_size, 4, 4});
+
+    auto graph_body = insertGraphBody(input);
+    output->from(graph_body);
+  }
+
+  virtual ~SISOGraph() = default;
+
+protected:
+  virtual loco::Node *insertGraphBody(loco::Node *input) = 0;
+
+public:
+  loco::Graph g;
+  luci::CircleInput *input = nullptr;
+  luci::CircleOutput *output = nullptr;
+};
+
+class AddGraph final : public SISOGraph
+{
+protected:
+  loco::Node *insertGraphBody(loco::Node *input) override
+  {
+    add = g.nodes()->create<luci::CircleAdd>();
+    beta = g.nodes()->create<luci::CircleConst>();
+
+    add->dtype(loco::DataType::U8);
+    beta->dtype(loco::DataType::U8);
+    add->quantparam(make_qparam(0.1, 11));
+    beta->quantparam(make_qparam(0.2, 12));
+
+    uint32_t channel_size = 16;
+    add->shape({1, 4, 4, channel_size});
+    beta->shape({1, 1, 1, channel_size});
+
+    beta->size<loco::DataType::U8>(channel_size);
+    for (uint32_t i = 0; i < channel_size; i++)
+    {
+      beta->at<loco::DataType::U8>(i) = i;
+    }
+
+    add->x(input);
+    add->y(beta);
+
+    add->name("add");
+    beta->name("beta");
+
+    return add;
+  }
+
+public:
+  luci::CircleAdd *add = nullptr;
+  luci::CircleConst *beta = nullptr;
+};
+
+} // namespace
+
+TEST(ForceQuantParamPassTest, simple)
+{
+  TensorVector tensors{"input", "add"};
+  ScaleVector scales{2.0, 3.0};
+  ZPVector zerops{4, 8};
+
+  luci::ForceQuantParamPass pass(tensors, scales, zerops);
+
+  AddGraph g;
+  g.init();
+
+  pass.run(&g.g);
+
+  EXPECT_TRUE(check_per_tensor_qparam(g.input, 2.0, 4));
+  EXPECT_TRUE(check_per_tensor_qparam(g.add, 3.0, 8));
+}
+
+TEST(ForceQuantParamPassTest, name_mismatch_NEG)
+{
+  TensorVector tensors{"no_exist"};
+  ScaleVector scales{2.0};
+  ZPVector zerops{4};
+
+  luci::ForceQuantParamPass pass(tensors, scales, zerops);
+
+  AddGraph g;
+  g.init();
+
+  EXPECT_THROW(pass.run(&g.g), std::runtime_error);
+}
diff --git a/compiler/luci/pass/src/FuseAddWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseAddWithFullyConnectedPass.cpp
new file mode 100644
index 000000000..97a962cb6
--- /dev/null
+++ b/compiler/luci/pass/src/FuseAddWithFullyConnectedPass.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseAddWithFullyConnectedPass.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Service/Nodes/CircleConst.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+
+namespace
+{
+/**
+ *  Fuse Add to FullyConnected if the added value is a channel(last dimension)-wise constant
+ *
+ *  BEFORE
+ *                |
+ *      [CircleFullyConnected]
+ *                |
+ *           [CircleAdd]
+ *                |
+ *
+ *  AFTER
+ *                |
+ *       [CircleFullyConnected]   [CircleAdd] (dead)
+ *                |
+ *
+ */
+bool fuse_add_with_fc(luci::CircleFullyConnected *fc)
+{
+  if (not fc)
+    return false;
+
+  if (fc->dtype() != loco::DataType::FLOAT32)
+    return false;
+
+  if (fc->fusedActivationFunction() != luci::FusedActFunc::NONE)
+    return false;
+
+  auto weights = dynamic_cast<luci::CircleConst *>(fc->weights());
+  if (not weights)
+    return false;
+
+  // Get add node
+  auto fc_output = loco::succs(fc);
+  if (fc_output.size() != 1)
+    return false;
+
+  auto add = dynamic_cast<luci::CircleAdd *>(*fc_output.begin());
+  if (not add)
+    return false;
+  if (add->dtype() != loco::DataType::FLOAT32)
+    return false;
+
+  // Get addition
+  auto addition = add->x() == fc ? dynamic_cast<luci::CircleConst *>(add->y())
+                                 : dynamic_cast<luci::CircleConst *>(add->x());
+
+  // Non-const addition
+  if (not addition)
+    return false;
+
+  auto rank = addition->rank();
+  // TODO Support scalar addition
+  if (rank == 0)
+    return false;
+
+  for (uint32_t i = 0; i < rank - 1; i++)
+  {
+    if (addition->dim(i).value() != 1)
+      return false;
+  }
+  // Check the last dimesion of addition is the same with the number of neurons of FC
+  if (not(addition->dim(rank - 1) == weights->dim(0)))
+    return false;
+
+  auto fused_bias = luci::clone(addition);
+
+  // Add existing bias values
+  if (auto const_bias = dynamic_cast<luci::CircleConst *>(fc->bias()))
+  {
+    assert(const_bias->dtype() == loco::DataType::FLOAT32);
+
+    auto bias_size = fused_bias->size<loco::DataType::FLOAT32>();
+    assert(bias_size == const_bias->size<loco::DataType::FLOAT32>());
+    for (uint32_t i = 0; i < bias_size; i++)
+      fused_bias->at<loco::DataType::FLOAT32>(i) += const_bias->at<loco::DataType::FLOAT32>(i);
+  }
+
+  fc->bias(fused_bias);
+  fc->fusedActivationFunction(add->fusedActivationFunction());
+
+  // set origin
+  luci::add_origin(fc, luci::get_origin(add));
+
+  replace(add).with(fc);
+
+  return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+bool FuseAddWithFullyConnectedPass::run(loco::Graph *g)
+{
+  bool changed = false;
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto fc = dynamic_cast<luci::CircleFullyConnected *>(node);
+    if (not fc)
+      continue;
+
+    if (fuse_add_with_fc(fc))
+      changed = true;
+  }
+
+  return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/FuseAddWithFullyConnectedPass.test.cpp b/compiler/luci/pass/src/FuseAddWithFullyConnectedPass.test.cpp
new file mode 100644
index 000000000..4cc2eb599
--- /dev/null
+++ b/compiler/luci/pass/src/FuseAddWithFullyConnectedPass.test.cpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseAddWithFullyConnectedPass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <luci/test/TestIOGraph.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+// TODO Reduce duplicate codes in ResolveCustomOpMatMulPass.cpp
+template <typename T>
+luci::CircleConst *create_const_node(loco::Graph *g, const loco::DataType dtype,
+                                     const std::vector<uint32_t> &shape,
+                                     const std::vector<T> &values)
+{
+  auto node = g->nodes()->create<luci::CircleConst>();
+  node->dtype(dtype);
+  node->rank(shape.size());
+
+  uint32_t size = 1;
+  for (uint32_t i = 0; i < shape.size(); ++i)
+  {
+    node->dim(i) = shape.at(i);
+    size *= shape.at(i);
+  }
+  node->shape_status(luci::ShapeStatus::VALID);
+
+#define INIT_VALUES(DT)                          \
+  {                                              \
+    node->size<DT>(size);                        \
+    for (uint32_t i = 0; i < values.size(); ++i) \
+      node->at<DT>(i) = values[i];               \
+  }
+
+  switch (dtype)
+  {
+    case loco::DataType::U8:
+      INIT_VALUES(loco::DataType::U8);
+      break;
+    case loco::DataType::S16:
+      INIT_VALUES(loco::DataType::S16);
+      break;
+    case loco::DataType::S32:
+      INIT_VALUES(loco::DataType::S32);
+      break;
+    case loco::DataType::FLOAT32:
+      INIT_VALUES(loco::DataType::FLOAT32)
+      break;
+    default:
+      INTERNAL_EXN("create_const_node called with unsupported type");
+      break;
+  }
+  return node;
+}
+
+/**
+ *  Simple graph for test
+ *
+ *  BEFORE
+ *
+ *         [FC]
+ *           |
+ *     [Add w/ Relu]
+ *
+ *  AFTER
+ *
+ *      [FC w/ Relu] (bias updated)
+ *
+ */
+class FCAddGraphlet
+{
+public:
+  FCAddGraphlet() = default;
+
+  void init(loco::Graph *g)
+  {
+    std::vector<float> weights_val(16 * 4);
+    _fc_f = create_const_node(g, loco::DataType::FLOAT32, {16, 4}, weights_val);
+
+    std::vector<float> bias_val(16);
+    _fc_b = create_const_node(g, loco::DataType::FLOAT32, {1, 16}, bias_val);
+
+    _fc = g->nodes()->create<luci::CircleFullyConnected>();
+    _fc->weights(_fc_f);
+    _fc->bias(_fc_b);
+    _fc->fusedActivationFunction(luci::FusedActFunc::NONE);
+    _fc->dtype(loco::DataType::FLOAT32);
+    _fc->shape({1, 16});
+    _fc->name("fc");
+
+    std::vector<float> addition_val;
+    for (uint32_t i = 0; i < 16; i++)
+      addition_val.push_back(static_cast<float>(i));
+    _add_c = create_const_node(g, loco::DataType::FLOAT32, {1, 16}, addition_val);
+
+    _add = g->nodes()->create<luci::CircleAdd>();
+    _add->x(_fc);
+    _add->y(_add_c);
+    _add->fusedActivationFunction(luci::FusedActFunc::RELU);
+    _add->dtype(loco::DataType::FLOAT32);
+    _add->shape({1, 16});
+    _add->name("add");
+  }
+
+public:
+  luci::CircleFullyConnected *fc() { return _fc; }
+
+protected:
+  luci::CircleFullyConnected *_fc = nullptr;
+  luci::CircleAdd *_add = nullptr;
+  luci::CircleConst *_fc_f = nullptr;
+  luci::CircleConst *_fc_b = nullptr;
+  luci::CircleConst *_add_c = nullptr;
+};
+
+class FuseAddWithFCTestGraph : public TestIOGraph, public FCAddGraphlet
+{
+public:
+  FuseAddWithFCTestGraph() = default;
+
+  void init(void)
+  {
+    TestIOGraph::init({1, 4}, {1, 16});
+    FCAddGraphlet::init(g());
+
+    _fc->input(input());
+
+    output()->from(_add);
+  }
+};
+
+class FuseAddWithFullyConnectedPassTest : public ::testing::Test
+{
+public:
+  FuseAddWithFCTestGraph g;
+  luci::FuseAddWithFullyConnectedPass pass;
+};
+
+} // namespace
+
+TEST_F(FuseAddWithFullyConnectedPassTest, simple_test)
+{
+  g.init();
+
+  auto ret = pass.run(g.g());
+  EXPECT_EQ(true, ret);
+
+  auto fc = dynamic_cast<luci::CircleFullyConnected *>(g.output()->from());
+  EXPECT_NE(nullptr, fc);
+
+  auto bias = loco::must_cast<luci::CircleConst *>(g.fc()->bias());
+  for (uint32_t i = 0; i < bias->size<loco::DataType::FLOAT32>(); i++)
+  {
+    EXPECT_EQ(i, bias->at<loco::DataType::FLOAT32>(i));
+  }
+}
diff --git a/compiler/luci/pass/src/PropagateQuantParamPass.cpp b/compiler/luci/pass/src/PropagateQuantParamPass.cpp
index 10c113574..b1cb7a418 100644
--- a/compiler/luci/pass/src/PropagateQuantParamPass.cpp
+++ b/compiler/luci/pass/src/PropagateQuantParamPass.cpp
@@ -73,7 +73,13 @@ struct PropagateQuantParam final : public luci::CircleNodeMutableVisitor<bool>
     return copy_qparam(input_node, node);
   }
 
-  // TODO : Add more Ops (e.g., Transpose)
+  bool visit(luci::CircleTranspose *node)
+  {
+    auto input_node = loco::must_cast<luci::CircleNode *>(node->a());
+    return copy_qparam(input_node, node);
+  }
+
+  // TODO : Add more Ops (e.g., layout-changing Ops)
 };
 
 } // namespace
diff --git a/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp b/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp
index e99c7b389..c8ad87e3d 100644
--- a/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp
+++ b/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp
@@ -358,7 +358,7 @@ bool QuantizeDequantizeWeightsPass::run(loco::Graph *g)
   // Quantize weights
   for (auto node : loco::active_nodes(loco::output_nodes(g)))
   {
-    QuantizeDequantizeWeights qw(_input_dtype, _output_dtype, _granularity);
+    QuantizeDequantizeWeights qw(_input_model_dtype, _output_model_dtype, _granularity);
     auto circle_node = loco::must_cast<luci::CircleNode *>(node);
     circle_node->accept(&qw);
   }
diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
index 6afc2084f..be81732f8 100644
--- a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
+++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
@@ -609,6 +609,20 @@ struct QuantizeSpecialActivation final : public luci::CircleNodeMutableVisitor<v
     set_act_qparam(node, i_scale, i_zp);
   }
 
+  void visit(luci::CircleSplitVOut *node)
+  {
+    auto splitv = loco::must_cast<luci::CircleSplitV *>(node->input());
+    auto input = loco::must_cast<luci::CircleNode *>(splitv->input());
+    auto i_qparam = input->quantparam();
+    assert(i_qparam);
+    assert(i_qparam->scale.size() == 1); // FIX_CALLER_UNLESS
+    assert(i_qparam->zerop.size() == 1); // FIX_CALLER_UNLESS
+    auto i_scale = i_qparam->scale[0];
+    auto i_zp = i_qparam->zerop[0];
+
+    set_act_qparam(node, i_scale, i_zp);
+  }
+
   void visit(luci::CircleUnpackOut *node)
   {
     auto unpack = loco::must_cast<luci::CircleUnpack *>(node->input());
@@ -1157,6 +1171,7 @@ void quantize_const_inputs(luci::CircleNode *node, loco::DataType output_type)
     case luci::CircleOpcode::REVERSE_SEQUENCE:
     case luci::CircleOpcode::SLICE:
     case luci::CircleOpcode::SPACE_TO_BATCH_ND:
+    case luci::CircleOpcode::SPLIT_V:
     case luci::CircleOpcode::STRIDED_SLICE:
     case luci::CircleOpcode::SUM:
     case luci::CircleOpcode::TILE:
@@ -1176,6 +1191,7 @@ void quantize_const_inputs(luci::CircleNode *node, loco::DataType output_type)
     case luci::CircleOpcode::DIV:
     case luci::CircleOpcode::ELU:
     case luci::CircleOpcode::EQUAL:
+    case luci::CircleOpcode::EXP:
     case luci::CircleOpcode::FLOOR:
     case luci::CircleOpcode::FLOOR_DIV:
     case luci::CircleOpcode::GREATER:
@@ -1385,7 +1401,8 @@ void propagate_pad_v2_quantparam(luci::CirclePadV2 *pad_v2, loco::DataType quant
     auto pad_v2_input = loco::must_cast<luci::CircleNode *>(pad_v2->arg(0));
     overwrite_quantparam(pad_v2_input, pad_v2);
 
-    auto const_value_node = dynamic_cast<luci::CircleConst *>(pad_v2->arg(2));
+    auto const_value_node = loco::must_cast<luci::CircleConst *>(
+      pad_v2->arg(2)); // FIX ignore_pad_v2_const_quantization UNLESS
     auto new_const = luci::clone(const_value_node);
 
     const auto pad_v2_input_qparam = pad_v2_input->quantparam();
@@ -1458,7 +1475,7 @@ bool QuantizeWithMinMaxPass::run(loco::Graph *g)
   // Quantize activation
   for (auto node : loco::active_nodes(loco::output_nodes(g)))
   {
-    QuantizeActivation qa(_input_dtype, _output_dtype);
+    QuantizeActivation qa(_input_model_dtype, _output_model_dtype);
     auto circle_node = loco::must_cast<luci::CircleNode *>(node);
     circle_node->accept(&qa);
   }
@@ -1466,7 +1483,7 @@ bool QuantizeWithMinMaxPass::run(loco::Graph *g)
   // Quantize weights
   for (auto node : loco::active_nodes(loco::output_nodes(g)))
   {
-    QuantizeWeights qw(_input_dtype, _output_dtype, _granularity);
+    QuantizeWeights qw(_input_model_dtype, _output_model_dtype, _granularity);
     auto circle_node = loco::must_cast<luci::CircleNode *>(node);
     circle_node->accept(&qw);
   }
@@ -1474,7 +1491,7 @@ bool QuantizeWithMinMaxPass::run(loco::Graph *g)
   // Quantize bias
   for (auto node : loco::active_nodes(loco::output_nodes(g)))
   {
-    QuantizeBias qb(_input_dtype, _output_dtype, _granularity);
+    QuantizeBias qb(_input_model_dtype, _output_model_dtype, _granularity);
     auto circle_node = loco::must_cast<luci::CircleNode *>(node);
     circle_node->accept(&qb);
   }
@@ -1491,20 +1508,20 @@ bool QuantizeWithMinMaxPass::run(loco::Graph *g)
     // (2) concat has no fused activation function
     // (3) the input is not concatenation Op
     // (4) the input is not produced to Ops other than concat
-    propagate_concat_quantparam(concat, _output_dtype);
+    propagate_concat_quantparam(concat, _output_model_dtype);
   }
 
   // Quantize const inputs other than weights and bias
   for (auto node : loco::active_nodes(loco::output_nodes(g)))
   {
     auto circle_node = loco::must_cast<luci::CircleNode *>(node);
-    quantize_const_inputs(circle_node, _output_dtype);
+    quantize_const_inputs(circle_node, _output_model_dtype);
   }
 
   // Update qparam of output of special Ops
   for (auto node : loco::active_nodes(loco::output_nodes(g)))
   {
-    QuantizeSpecialActivation qsa(_input_dtype, _output_dtype);
+    QuantizeSpecialActivation qsa(_input_model_dtype, _output_model_dtype);
     auto circle_node = loco::must_cast<luci::CircleNode *>(node);
     circle_node->accept(&qsa);
   }
@@ -1514,11 +1531,11 @@ bool QuantizeWithMinMaxPass::run(loco::Graph *g)
   for (auto node : loco::output_nodes(g))
   {
     auto circle_node = loco::must_cast<luci::CircleOutput *>(node);
-    if (static_cast<luci::CircleNode *>(circle_node->from())->dtype() == _output_dtype)
+    if (static_cast<luci::CircleNode *>(circle_node->from())->dtype() == _output_model_dtype)
     {
-      circle_node->dtype(_output_dtype);
+      circle_node->dtype(_output_model_dtype);
       auto graph_output = graph_outputs->at(circle_node->index());
-      graph_output->dtype(_output_dtype);
+      graph_output->dtype(_output_model_dtype);
     }
   }
 
diff --git a/compiler/luci/pass/src/QuantizedModelVerifier.test.cpp b/compiler/luci/pass/src/QuantizedModelVerifier.test.cpp
index b8cc09955..3a6d86c33 100644
--- a/compiler/luci/pass/src/QuantizedModelVerifier.test.cpp
+++ b/compiler/luci/pass/src/QuantizedModelVerifier.test.cpp
@@ -189,6 +189,12 @@ void set_minmax_to_non_const(loco::Graph *g, float min, float max)
     if (split_node != nullptr)
       continue;
 
+    // Min/Max is not recorded for SplitV
+    // See MinMaxObserver.cpp in record_minmax module
+    auto splitv_node = dynamic_cast<luci::CircleSplitV *>(node);
+    if (splitv_node != nullptr)
+      continue;
+
     auto circle_node = loco::must_cast<luci::CircleNode *>(node);
     auto qparam = std::make_unique<luci::CircleQuantParam>();
     {
@@ -410,6 +416,38 @@ private:
   luci::CircleConst *_split_dim = nullptr;
 };
 
+class SplitVTestGraph final : public luci::test::TestIOGraph
+{
+public:
+  void init(void)
+  {
+    TestIOGraph::init({1, 32}, {32});
+    _size_splits = create_dummy_const<Type::S32>(g(), {1});
+    _split_dim = create_dummy_const<Type::S32>(g(), {1});
+    _splitv = g()->nodes()->create<luci::CircleSplitV>();
+    {
+      _splitv->input(input());
+      _splitv->size_splits(_size_splits);
+      _splitv->split_dim(_split_dim);
+    }
+    _splitv_o1 = g()->nodes()->create<luci::CircleSplitVOut>();
+    {
+      _splitv_o1->input(_splitv);
+      _splitv_o1->index(0);
+    }
+
+    output()->from(_splitv_o1);
+
+    set_minmax_to_non_const(g(), -1, 1);
+  }
+
+private:
+  luci::CircleSplitV *_splitv = nullptr;
+  luci::CircleSplitVOut *_splitv_o1 = nullptr;
+  luci::CircleConst *_size_splits = nullptr;
+  luci::CircleConst *_split_dim = nullptr;
+};
+
 class StridedSliceTestGraph final : public SimpleTestGraph
 {
 public:
@@ -1312,6 +1350,30 @@ TEST(QuantizedModelVerifierTest, Split_wrong_granularity_NEG)
   SUCCEED();
 }
 
+TEST(QuantizedModelVerifierTest, SplitV)
+{
+  TEST_WITH_GRAPH(SplitVTestGraph, Type::U8, Granularity::LayerWise);
+  TEST_WITH_GRAPH(SplitVTestGraph, Type::U8, Granularity::ChannelWise);
+  TEST_WITH_GRAPH(SplitVTestGraph, Type::S16, Granularity::ChannelWise);
+  SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, SplitV_wrong_type_NEG)
+{
+  TEST_WITH_WRONG_TYPE(SplitVTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+  TEST_WITH_WRONG_TYPE(SplitVTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+  TEST_WITH_WRONG_TYPE(SplitVTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+  SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, SplitV_wrong_granularity_NEG)
+{
+  TEST_WITH_WRONG_GRANULARITY(SplitVTestGraph, Type::U8, Granularity::LayerWise);
+  TEST_WITH_WRONG_GRANULARITY(SplitVTestGraph, Type::U8, Granularity::ChannelWise);
+  TEST_WITH_WRONG_GRANULARITY(SplitVTestGraph, Type::S16, Granularity::ChannelWise);
+  SUCCEED();
+}
+
 TEST(QuantizedModelVerifierTest, StridedSlice)
 {
   TEST_WITH_GRAPH(StridedSliceTestGraph, Type::U8, Granularity::LayerWise);
diff --git a/compiler/luci/pass/src/ResolveCustomOpAddPass.cpp b/compiler/luci/pass/src/ResolveCustomOpAddPass.cpp
index 1737e5dd6..9f7e2f17d 100644
--- a/compiler/luci/pass/src/ResolveCustomOpAddPass.cpp
+++ b/compiler/luci/pass/src/ResolveCustomOpAddPass.cpp
@@ -16,12 +16,12 @@
 
 #include "luci/Pass/ResolveCustomOpAddPass.h"
 
-#include "flatbuffers/flexbuffers.h"
-
 #include <luci/IR/CircleNodes.h>
 #include <luci/IR/AttrFusedActFunc.h>
 #include <luci/Profile/CircleNodeOrigin.h>
 
+#include <flatbuffers/flexbuffers.h>
+
 namespace
 {
 
diff --git a/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp b/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp
index 5e9466a63..7ebd7a429 100644
--- a/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp
+++ b/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp
@@ -16,11 +16,11 @@
 
 #include "luci/Pass/ResolveCustomOpBatchMatMulPass.h"
 
-#include "flatbuffers/flexbuffers.h"
-
 #include <luci/IR/CircleNodes.h>
 #include <luci/Profile/CircleNodeOrigin.h>
 
+#include <flatbuffers/flexbuffers.h>
+
 namespace
 {
 
diff --git a/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.test.cpp b/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.test.cpp
index 435016f9d..7ef61c253 100644
--- a/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.test.cpp
+++ b/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.test.cpp
@@ -18,12 +18,11 @@
 
 #include <luci/IR/CircleNodes.h>
 
-#include "flatbuffers/flatbuffers.h"
-#include "flatbuffers/flexbuffers.h"
-
 #include <luci/test/TestIOGraph.h>
 
 #include <gtest/gtest.h>
+#include <flatbuffers/flatbuffers.h>
+#include <flatbuffers/flexbuffers.h>
 
 namespace
 {
diff --git a/compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp b/compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp
index 216778066..1e8f681c8 100644
--- a/compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp
+++ b/compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp
@@ -16,7 +16,6 @@
 
 #include "luci/Pass/ResolveCustomOpMatMulPass.h"
 
-#include "flatbuffers/flexbuffers.h"
 #include <loco/IR/DataTypeTraits.h>
 
 #include <luci/IR/CircleNodes.h>
@@ -25,6 +24,8 @@
 #include <loco.h>
 #include <oops/InternalExn.h>
 
+#include <flatbuffers/flexbuffers.h>
+
 namespace
 {
 
diff --git a/compiler/luci/pass/src/ResolveCustomOpMaxPoolWithArgmaxPass.cpp b/compiler/luci/pass/src/ResolveCustomOpMaxPoolWithArgmaxPass.cpp
index d78a587ac..f37f27742 100644
--- a/compiler/luci/pass/src/ResolveCustomOpMaxPoolWithArgmaxPass.cpp
+++ b/compiler/luci/pass/src/ResolveCustomOpMaxPoolWithArgmaxPass.cpp
@@ -16,7 +16,6 @@
 
 #include "luci/Pass/ResolveCustomOpMaxPoolWithArgmaxPass.h"
 
-#include "flatbuffers/flexbuffers.h"
 #include <loco/IR/DataTypeTraits.h>
 
 #include <luci/IR/CircleNodes.h>
@@ -25,6 +24,8 @@
 #include <loco.h>
 #include <oops/InternalExn.h>
 
+#include <flatbuffers/flexbuffers.h>
+
 namespace
 {
 
diff --git a/compiler/luci/pass/src/SubstituteSplitVToSplitPass.cpp b/compiler/luci/pass/src/SubstituteSplitVToSplitPass.cpp
new file mode 100644
index 000000000..9cba9a9e7
--- /dev/null
+++ b/compiler/luci/pass/src/SubstituteSplitVToSplitPass.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/SubstituteSplitVToSplitPass.h"
+
+#include <loco.h>
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+
+namespace
+{
+
+void copy_quantparam(luci::CircleNode *dst, const luci::CircleNode *src)
+{
+  auto q = src->quantparam();
+  if (q == nullptr)
+    dst->quantparam(nullptr);
+  else
+    dst->quantparam(std::make_unique<luci::CircleQuantParam>(*q));
+}
+
+// SplitV is substituted to Split if the contents of size_splits are all same
+// For example,
+// size_splits = [32, 32] -> substitute
+// size_splits = [31, 33] -> do not substitute
+bool resolve_splitv(luci::CircleSplitV *sv)
+{
+  auto size_splits = dynamic_cast<luci::CircleConst *>(sv->size_splits());
+  if (not size_splits)
+    return false;
+
+  if (size_splits->dtype() != loco::DataType::S32)
+    return false;
+
+  auto num_split = size_splits->size<loco::DataType::S32>();
+  if (static_cast<int32_t>(num_split) != sv->num_split())
+    return false;
+
+  if (num_split < 1)
+    return false;
+
+  // Check the contents of size_splits are all same
+  auto first_size = size_splits->at<loco::DataType::S32>(0);
+  for (uint32_t i = 1; i < num_split; i++)
+  {
+    if (first_size != size_splits->at<loco::DataType::S32>(i))
+      return false;
+  }
+
+  auto graph = sv->graph();
+  auto split_node = graph->nodes()->create<luci::CircleSplit>();
+  split_node->input(sv->input());
+  split_node->split_dim(sv->split_dim());
+  split_node->num_split(sv->num_split());
+  split_node->name(sv->name());
+  copy_quantparam(split_node, sv);
+  luci::add_origin(split_node, luci::get_origin(sv));
+
+  auto succs = loco::succs(sv);
+  for (auto succ : succs)
+  {
+    auto svo = loco::must_cast<luci::CircleSplitVOut *>(succ);
+    auto so_node = graph->nodes()->create<luci::CircleSplitOut>();
+    so_node->input(split_node);
+    so_node->index(svo->index());
+    so_node->name(svo->name());
+    copy_quantparam(so_node, svo);
+    luci::add_origin(so_node, luci::get_origin(svo));
+
+    replace(svo).with(so_node);
+  }
+
+  return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ *  EXAMPLE (SplitV with num_split = 2)
+ *
+ *  BEFORE
+ *              [CircleNode]
+ *                   |
+ *             [CircleSplitV] (size_splits and split_dim are ignored)
+ *                /      \
+ *   [CircleSplitVOut]  [CircleSplitVOut]
+ *            |                 |
+ *       [CircleNode]     [CircleNode]
+ *
+ *  AFTER
+ *                    [CircleNode]
+ *                     /         \
+ *             [CircleSplit]    [CircleSplitV] (dead)
+ *                /      \               \
+ *   [CircleSplitOut]  [CircleSplitOut]  [CircleSplitVOut] * 2 (dead)
+ *            |                 |
+ *       [CircleNode]     [CircleNode]
+ */
+bool SubstituteSplitVToSplitPass::run(loco::Graph *g)
+{
+  bool changed = false;
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
+  {
+    if (auto sv = dynamic_cast<luci::CircleSplitV *>(node))
+    {
+      if (resolve_splitv(sv))
+        changed = true;
+    }
+  }
+
+  return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/SubstituteSplitVToSplitPass.test.cpp b/compiler/luci/pass/src/SubstituteSplitVToSplitPass.test.cpp
new file mode 100644
index 000000000..6e30103f9
--- /dev/null
+++ b/compiler/luci/pass/src/SubstituteSplitVToSplitPass.test.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/SubstituteSplitVToSplitPass.h"
+
+#include <luci/test/TestIOGraph.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+const int N = 1;
+const int C = 32;
+const int H = 8;
+const int W = 8;
+
+// Reduce duplicate codes in ResolveCustomOpMatMulPass.cpp
+template <typename T>
+luci::CircleConst *create_const_node(loco::Graph *g, const loco::DataType dtype,
+                                     const std::vector<uint32_t> &shape,
+                                     const std::vector<T> &values)
+{
+  auto node = g->nodes()->create<luci::CircleConst>();
+  node->dtype(dtype);
+  node->rank(shape.size());
+
+  uint32_t size = 1;
+  for (uint32_t i = 0; i < shape.size(); ++i)
+  {
+    node->dim(i) = shape.at(i);
+    size *= shape.at(i);
+  }
+  node->shape_status(luci::ShapeStatus::VALID);
+
+#define INIT_VALUES(DT)                          \
+  {                                              \
+    node->size<DT>(size);                        \
+    for (uint32_t i = 0; i < values.size(); ++i) \
+      node->at<DT>(i) = values[i];               \
+  }
+
+  switch (dtype)
+  {
+    case loco::DataType::U8:
+      INIT_VALUES(loco::DataType::U8);
+      break;
+    case loco::DataType::S16:
+      INIT_VALUES(loco::DataType::S16);
+      break;
+    case loco::DataType::S32:
+      INIT_VALUES(loco::DataType::S32);
+      break;
+    case loco::DataType::FLOAT32:
+      INIT_VALUES(loco::DataType::FLOAT32)
+      break;
+    default:
+      INTERNAL_EXN("create_const_node called with unsupported type");
+      break;
+  }
+  return node;
+}
+/**
+ *  graph having SplitV operator
+ *
+ *                [CircleInput]
+ *                      |
+ *                [CircleSplitV]
+ *                     /  \
+ *      [CircleSplitVOut] [CircleSplitVOut]
+ *             |                   |
+ *       [CircleOutput]     [CircleOutput]
+ */
+class SplitVGraphlet
+{
+public:
+  SplitVGraphlet() = default;
+
+public:
+  void init(loco::Graph *g)
+  {
+    const std::vector<int32_t> splits{16, 16};
+    auto size_splits = create_const_node(g, loco::DataType::S32, {2}, splits);
+
+    const std::vector<int32_t> dim{3};
+    auto split_dim = create_const_node(g, loco::DataType::S32, {1}, dim);
+
+    _sv = g->nodes()->create<luci::CircleSplitV>();
+    _sv->size_splits(size_splits);
+    _sv->split_dim(split_dim);
+    _sv->num_split(2);
+    _sv->name("SplitV");
+
+    _svo1 = g->nodes()->create<luci::CircleSplitVOut>();
+    _svo1->input(_sv);
+    _svo1->index(0);
+    _svo1->name("SplitV0");
+
+    _svo2 = g->nodes()->create<luci::CircleSplitVOut>();
+    _svo2->input(_sv);
+    _svo2->index(1);
+    _svo2->name("SplitV1");
+  }
+
+public:
+  luci::CircleSplitV *split_v() { return _sv; }
+  luci::CircleSplitVOut *split_vo1() { return _svo1; }
+  luci::CircleSplitVOut *split_vo2() { return _svo2; }
+
+protected:
+  luci::CircleSplitV *_sv = nullptr;
+  luci::CircleSplitVOut *_svo1 = nullptr;
+  luci::CircleSplitVOut *_svo2 = nullptr;
+};
+
+class SplitVGraph : public TestIsGraphlet<1>, public TestOsGraphlet<2>, public SplitVGraphlet
+{
+public:
+  SplitVGraph() = default;
+
+  void init(void)
+  {
+    TestIsGraphlet<1>::init(g(), {{N, C, H, W}});
+    TestOsGraphlet<2>::init(g(), {{N, C, H / 2, W / 2}, {N, C, H / 2, W / 2}});
+    SplitVGraphlet::init(g());
+
+    split_v()->input(input(0));
+
+    output(0)->from(split_vo1());
+    output(1)->from(split_vo2());
+  }
+};
+
+class SubstituteSplitVToSplitPassTest : public ::testing::Test
+{
+public:
+  SplitVGraph g;
+  luci::SubstituteSplitVToSplitPass pass;
+};
+
+} // namespace
+
+/**
+ *  Optimized graph looks like below.
+ *
+ *                [CircleInput]
+ *                      |
+ *                [CircleSplit]
+ *                     /  \
+ *      [CircleSplitOut] [CircleSplitOut]
+ *             |                 |
+ *       [CircleOutput]   [CircleOutput]
+ */
+TEST_F(SubstituteSplitVToSplitPassTest, simple_test)
+{
+  g.init();
+
+  auto ret = pass.run(g.g());
+  EXPECT_EQ(true, ret);
+
+  auto so1 = dynamic_cast<luci::CircleSplitOut *>(g.output(0)->from());
+  EXPECT_NE(nullptr, so1);
+
+  auto so2 = dynamic_cast<luci::CircleSplitOut *>(g.output(1)->from());
+  EXPECT_NE(nullptr, so2);
+
+  EXPECT_EQ(so1->input(), so2->input());
+
+  auto s = dynamic_cast<luci::CircleSplit *>(so1->input());
+  EXPECT_NE(nullptr, s);
+
+  auto input = dynamic_cast<luci::CircleInput *>(s->input());
+  EXPECT_NE(nullptr, input);
+}
+
+TEST_F(SubstituteSplitVToSplitPassTest, wrong_condition_NEG)
+{
+  g.init();
+
+  g.split_v()->num_split(3); // Wrong num_split
+  auto ret = pass.run(g.g());
+
+  EXPECT_EQ(false, ret);
+}
diff --git a/compiler/luci/pass/src/SubstituteSqueezeToReshapePass.cpp b/compiler/luci/pass/src/SubstituteSqueezeToReshapePass.cpp
index 74be86a4c..f48763782 100644
--- a/compiler/luci/pass/src/SubstituteSqueezeToReshapePass.cpp
+++ b/compiler/luci/pass/src/SubstituteSqueezeToReshapePass.cpp
@@ -76,6 +76,18 @@ std::vector<uint32_t> node_shape(const luci::CircleNode *input)
 }
 
 /**
+ * @brief copy quantparam of src to dst
+ */
+void copy_quantparam(luci::CircleNode *dst, const luci::CircleNode *src)
+{
+  auto q = src->quantparam();
+  if (q == nullptr)
+    dst->quantparam(nullptr);
+  else
+    dst->quantparam(std::make_unique<luci::CircleQuantParam>(*q));
+}
+
+/**
  * @brief return CircleConst ptr with values of new_shape
  */
 luci::CircleConst *create_shape_const(loco::Graph *graph, const std::vector<uint32_t> &new_shape)
@@ -130,6 +142,7 @@ bool substitute_squeeze_to_reshape(luci::CircleSqueeze *squeeze)
   auto graph = squeeze->graph();
   auto reshape = graph->nodes()->create<luci::CircleReshape>();
   auto shape_const = create_shape_const(graph, reshape_shape);
+  copy_quantparam(reshape, squeeze);
   reshape->name(name + "/Reshape");
   luci::add_origin(reshape, luci::get_origin(squeeze));
   shape_const->name(name + "/Reshape/shape");
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeChannelWiseGranularity.h b/compiler/luci/pass/src/VerifyQuantizedNodeChannelWiseGranularity.h
index 1706b9e43..bf3ff2e8a 100644
--- a/compiler/luci/pass/src/VerifyQuantizedNodeChannelWiseGranularity.h
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeChannelWiseGranularity.h
@@ -324,6 +324,19 @@ private:
     return true;
   }
 
+  bool visit(const luci::CircleSplitV *node)
+  {
+    // node's output is the input of CircleSplitVOut, thus not quantized
+    RETURN_FALSE_UNLESS(is_lwq(node->input()));
+    return true;
+  }
+
+  bool visit(const luci::CircleSplitVOut *node)
+  {
+    RETURN_FALSE_UNLESS(is_lwq(node));
+    return true;
+  }
+
   bool visit(const luci::CircleStridedSlice *node)
   {
     RETURN_FALSE_UNLESS(is_lwq(node));
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeLayerWiseGranularity.h b/compiler/luci/pass/src/VerifyQuantizedNodeLayerWiseGranularity.h
index 3954bf216..9bc8b31df 100644
--- a/compiler/luci/pass/src/VerifyQuantizedNodeLayerWiseGranularity.h
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeLayerWiseGranularity.h
@@ -310,6 +310,19 @@ private:
     return true;
   }
 
+  bool visit(const luci::CircleSplitV *node)
+  {
+    // node's output is the input of CircleSplitVOut, thus not quantized
+    RETURN_FALSE_UNLESS(is_lwq(node->input()));
+    return true;
+  }
+
+  bool visit(const luci::CircleSplitVOut *node)
+  {
+    RETURN_FALSE_UNLESS(is_lwq(node));
+    return true;
+  }
+
   bool visit(const luci::CircleStridedSlice *node)
   {
     RETURN_FALSE_UNLESS(is_lwq(node));
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeS16Type.h b/compiler/luci/pass/src/VerifyQuantizedNodeS16Type.h
index 560abd2ff..eeec7b82b 100644
--- a/compiler/luci/pass/src/VerifyQuantizedNodeS16Type.h
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeS16Type.h
@@ -310,6 +310,26 @@ private:
     return true;
   }
 
+  bool visit(const luci::CircleSplitV *node)
+  {
+    // node's output is the input of CircleSplitVOut, thus not quantized
+    RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+    return true;
+  }
+
+  bool visit(const luci::CircleSplitVOut *node)
+  {
+    RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+
+    // SplitVOut has the same qparam with the input of SplitV
+    auto splitv = loco::must_cast<luci::CircleSplitV *>(node->input());
+    auto input = loco::must_cast<luci::CircleNode *>(splitv->input());
+    RETURN_FALSE_UNLESS(node->quantparam());
+    RETURN_FALSE_UNLESS(node->quantparam()->scale[0] == input->quantparam()->scale[0]);
+    RETURN_FALSE_UNLESS(node->quantparam()->zerop[0] == input->quantparam()->zerop[0]);
+    return true;
+  }
+
   bool visit(const luci::CircleStridedSlice *node)
   {
     RETURN_FALSE_UNLESS(has_type(node, Type::S16))
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeU8Type.h b/compiler/luci/pass/src/VerifyQuantizedNodeU8Type.h
index 42cd1ce55..e7dd1b072 100644
--- a/compiler/luci/pass/src/VerifyQuantizedNodeU8Type.h
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeU8Type.h
@@ -317,6 +317,26 @@ private:
     return true;
   }
 
+  bool visit(const luci::CircleSplitV *node)
+  {
+    // node's output is the input of CircleSplitVOut, thus not quantized
+    RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+    return true;
+  }
+
+  bool visit(const luci::CircleSplitVOut *node)
+  {
+    RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+
+    // SplitVOut has the same qparam with the input of SplitV
+    auto splitv = loco::must_cast<luci::CircleSplitV *>(node->input());
+    auto input = loco::must_cast<luci::CircleNode *>(splitv->input());
+    RETURN_FALSE_UNLESS(node->quantparam());
+    RETURN_FALSE_UNLESS(node->quantparam()->scale[0] == input->quantparam()->scale[0]);
+    RETURN_FALSE_UNLESS(node->quantparam()->zerop[0] == input->quantparam()->zerop[0]);
+    return true;
+  }
+
   bool visit(const luci::CircleStridedSlice *node)
   {
     RETURN_FALSE_UNLESS(has_type(node, Type::U8))
diff --git a/compiler/luci/plan/CMakeLists.txt b/compiler/luci/plan/CMakeLists.txt
new file mode 100644
index 000000000..9ca6dcb41
--- /dev/null
+++ b/compiler/luci/plan/CMakeLists.txt
@@ -0,0 +1,15 @@
+file(GLOB_RECURSE SOURCES "src/*.cpp")
+
+if (NOT LIBRARY_TYPE)
+    set(LIBRARY_TYPE "SHARED")
+endif(NOT LIBRARY_TYPE)
+
+add_library(luci_plan ${LIBRARY_TYPE} ${SOURCES})
+target_include_directories(luci_plan PRIVATE src)
+target_include_directories(luci_plan PUBLIC include)
+target_link_libraries(luci_plan PUBLIC loco)
+target_link_libraries(luci_plan PUBLIC luci_lang)
+
+install(TARGETS luci_plan DESTINATION lib)
+install(DIRECTORY include/ DESTINATION include
+        FILES_MATCHING PATTERN "*.h")
diff --git a/compiler/luci/plan/include/luci/Plan/CircleNodeExecutionPlan.h b/compiler/luci/plan/include/luci/Plan/CircleNodeExecutionPlan.h
new file mode 100644
index 000000000..fe966e35e
--- /dev/null
+++ b/compiler/luci/plan/include/luci/Plan/CircleNodeExecutionPlan.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_CIRCLE_NODE_EXECUTION_PLAN_H__
+#define __LUCI_CIRCLE_NODE_EXECUTION_PLAN_H__
+
+#include <luci/IR/CircleNode.h>
+
+#include <utility>
+
+namespace luci
+{
+
+class CircleNodeExecutionPlan
+{
+public:
+  CircleNodeExecutionPlan() = delete;
+
+  CircleNodeExecutionPlan(uint32_t order_in_plan, std::vector<uint32_t> offsets)
+  {
+    _order_in_plan = order_in_plan;
+    _offsets = std::move(offsets);
+  }
+
+  uint32_t order_in_plan(void) const { return _order_in_plan; }
+  void order_in_plan(const uint32_t &order_in_plan) { _order_in_plan = order_in_plan; }
+
+  std::vector<uint32_t> offsets(void) const { return _offsets; }
+  void offsets(const std::vector<uint32_t> &offsets) { _offsets = offsets; }
+
+private:
+  uint32_t _order_in_plan = 0;
+  std::vector<uint32_t> _offsets;
+};
+
+bool has_execution_plan(const luci::CircleNode *circle_node);
+
+void add_execution_plan(luci::CircleNode *circle_node,
+                        const luci::CircleNodeExecutionPlan &execution_plan);
+
+luci::CircleNodeExecutionPlan get_execution_plan(const luci::CircleNode *circle_node);
+
+} // namespace luci
+
+#endif // __LUCI_CIRCLE_NODE_EXECUTION_PLAN_H__
diff --git a/compiler/luci/plan/src/CircleNodeExecutionPlan.cpp b/compiler/luci/plan/src/CircleNodeExecutionPlan.cpp
new file mode 100644
index 000000000..a02ebc452
--- /dev/null
+++ b/compiler/luci/plan/src/CircleNodeExecutionPlan.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Plan/CircleNodeExecutionPlan.h"
+
+#include <loco.h>
+
+#include <stdexcept>
+#include <utility>
+
+namespace
+{
+
+/**
+ * @brief Set annotation for circle node execution plan
+ * @note  Once CircleExecutionPlanAnnotation is annotated, it should not be changed.
+ *        If CircleExecutionPlanAnnotation is needed to be changed, create
+ *        new CircleExecutionPlanAnnotation.
+ */
+class CircleExecutionPlanAnnotation final : public loco::NodeAnnotation
+{
+public:
+  CircleExecutionPlanAnnotation() = delete;
+
+  explicit CircleExecutionPlanAnnotation(luci::CircleNodeExecutionPlan execution_plan)
+    : _execution_plan{std::move(execution_plan)}
+  {
+    // Do nothing
+  }
+
+public:
+  const luci::CircleNodeExecutionPlan &execution_plan(void) const { return _execution_plan; }
+  // No setter
+
+private:
+  luci::CircleNodeExecutionPlan _execution_plan;
+};
+
+} // namespace
+
+namespace luci
+{
+
+bool has_execution_plan(const luci::CircleNode *circle_node)
+{
+  return circle_node->annot<CircleExecutionPlanAnnotation>() != nullptr;
+}
+
+void add_execution_plan(luci::CircleNode *circle_node,
+                        const luci::CircleNodeExecutionPlan &execution_plan)
+{
+  circle_node->annot<CircleExecutionPlanAnnotation>(nullptr);
+  circle_node->annot(std::make_unique<CircleExecutionPlanAnnotation>(execution_plan));
+}
+
+luci::CircleNodeExecutionPlan get_execution_plan(const luci::CircleNode *circle_node)
+{
+  if (!has_execution_plan(circle_node))
+    throw std::runtime_error("Cannot find CircleNodeExecutionPlanAnnotation");
+
+  return circle_node->annot<CircleExecutionPlanAnnotation>()->execution_plan();
+}
+
+} // namespace luci
diff --git a/compiler/luci/profile/CMakeLists.txt b/compiler/luci/profile/CMakeLists.txt
index fdfcaf1de..ae604ab90 100644
--- a/compiler/luci/profile/CMakeLists.txt
+++ b/compiler/luci/profile/CMakeLists.txt
@@ -2,7 +2,11 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
 file(GLOB_RECURSE TESTS "src/*.test.cpp")
 list(REMOVE_ITEM SOURCES ${TESTS})
 
-add_library(luci_profile SHARED ${SOURCES})
+if (NOT LIBRARY_TYPE)
+  set(LIBRARY_TYPE "SHARED")
+endif(NOT LIBRARY_TYPE)
+
+add_library(luci_profile ${LIBRARY_TYPE} ${SOURCES})
 target_include_directories(luci_profile PRIVATE src)
 target_include_directories(luci_profile PUBLIC include)
 target_link_libraries(luci_profile PUBLIC loco)
diff --git a/compiler/luci/requires.cmake b/compiler/luci/requires.cmake
index 687bf573a..3ccc58128 100644
--- a/compiler/luci/requires.cmake
+++ b/compiler/luci/requires.cmake
@@ -5,6 +5,7 @@ require("locop")
 require("logo")
 require("logo-core")
 require("mio-circle")
+require("mio-tflite")
 require("oops")
 require("hermes")
 require("hermes-std")
diff --git a/compiler/luci/service/CMakeLists.txt b/compiler/luci/service/CMakeLists.txt
index 781e6d6de..f48210b9c 100644
--- a/compiler/luci/service/CMakeLists.txt
+++ b/compiler/luci/service/CMakeLists.txt
@@ -2,7 +2,11 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
 file(GLOB_RECURSE TESTS "src/*.test.cpp")
 list(REMOVE_ITEM SOURCES ${TESTS})
 
-add_library(luci_service SHARED ${SOURCES})
+if (NOT LIBRARY_TYPE)
+  set(LIBRARY_TYPE "SHARED")
+endif(NOT LIBRARY_TYPE)
+
+add_library(luci_service ${LIBRARY_TYPE} ${SOURCES})
 target_include_directories(luci_service PRIVATE src)
 target_include_directories(luci_service PUBLIC include)
 target_link_libraries(luci_service PUBLIC luci_lang)
diff --git a/compiler/luci/service/src/CircleTypeInferenceRule.cpp b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
index fade2cbd0..5f6d46f2b 100644
--- a/compiler/luci/service/src/CircleTypeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
@@ -314,8 +314,7 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
     return input_type;
   }
 
-  // TODO support S16
-  loco::DataType visit(const luci::CircleQuantize *) final { return loco::DataType::U8; }
+  loco::DataType visit(const luci::CircleQuantize *node) final { return luci::dtype_get(node); }
 
   loco::DataType visit(const luci::CircleRange *node) final
   {
diff --git a/compiler/mio-circle/CMakeLists.txt b/compiler/mio-circle/CMakeLists.txt
index 9c1126d6f..fa05ef0fa 100644
--- a/compiler/mio-circle/CMakeLists.txt
+++ b/compiler/mio-circle/CMakeLists.txt
@@ -1,4 +1,4 @@
-nnas_find_package(FlatBuffers QUIET)
+nnas_find_package(FlatBuffers EXACT 1.10 QUIET)
 
 if(NOT FlatBuffers_FOUND)
   return()
diff --git a/compiler/mio-tflite/CMakeLists.txt b/compiler/mio-tflite/CMakeLists.txt
index 9ef2859b9..4660e4003 100644
--- a/compiler/mio-tflite/CMakeLists.txt
+++ b/compiler/mio-tflite/CMakeLists.txt
@@ -1,4 +1,4 @@
-nnas_find_package(FlatBuffers QUIET)
+nnas_find_package(FlatBuffers EXACT 1.10 QUIET)
 
 if(NOT FlatBuffers_FOUND)
   message(STATUS "Build mio-tflite: FAILED (missing Flatbuffers)")
@@ -36,3 +36,13 @@ target_link_libraries(mio_tflite_example mio_tflite)
 # TODO provide full tflite validation with runtime/interpreter
 add_executable(mio_tflite_validate example.cpp)
 target_link_libraries(mio_tflite_validate mio_tflite)
+
+nnas_find_package(TensorFlowGEMMLowpSource EXACT 2.3.0 QUIET)
+
+if(NOT TensorFlowGEMMLowpSource_FOUND)
+  return()
+endif(NOT TensorFlowGEMMLowpSource_FOUND)
+
+add_library(mio_tflite_inc INTERFACE)
+target_include_directories(mio_tflite_inc SYSTEM INTERFACE "${TensorFlowSource_DIR}")
+target_include_directories(mio_tflite_inc SYSTEM INTERFACE "${TensorFlowGEMMLowpSource_DIR}")
diff --git a/compiler/mio-tflite260/CMakeLists.txt b/compiler/mio-tflite260/CMakeLists.txt
new file mode 100644
index 000000000..39f4d9a31
--- /dev/null
+++ b/compiler/mio-tflite260/CMakeLists.txt
@@ -0,0 +1,49 @@
+nnas_find_package(FlatBuffers EXACT 1.12 QUIET)
+
+if(NOT FlatBuffers_FOUND)
+  message(STATUS "Build mio-tflite260: FAILED (missing Flatbuffers 1.12)")
+  return()
+endif(NOT FlatBuffers_FOUND)
+
+nnas_find_package(TensorFlowSource EXACT 2.6.0 QUIET)
+
+if(NOT TensorFlowSource_FOUND)
+  message(STATUS "Build mio-tflite260: FAILED (missing TensorFlowSource 2.6.0)")
+  return()
+endif(NOT TensorFlowSource_FOUND)
+
+message(STATUS "Build mio-tflite260: TRUE")
+
+set(SCHEMA_FILE "${TensorFlowSource_DIR}/tensorflow/lite/schema/schema.fbs")
+
+# NOTE Use copy of schema.fbs as to provide unified way for circle also
+add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/schema.fbs"
+  COMMAND ${CMAKE_COMMAND} -E copy "${SCHEMA_FILE}" schema.fbs
+  WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+  DEPENDS "${SCHEMA_FILE}"
+)
+
+FlatBuffers_Target(mio_tflite260
+  OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/gen/mio/tflite"
+  INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}/gen"
+  SCHEMA_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  SCHEMA_FILES "schema.fbs"
+)
+
+add_executable(mio_tflite260_example example.cpp)
+target_link_libraries(mio_tflite260_example mio_tflite260)
+
+# Temporay tflite validation tool to replace nnkit-tflite
+# TODO provide full tflite validation with runtime/interpreter
+add_executable(mio_tflite260_validate example.cpp)
+target_link_libraries(mio_tflite260_validate mio_tflite260)
+
+nnas_find_package(TensorFlowGEMMLowpSource EXACT 2.6.0 QUIET)
+
+if(NOT TensorFlowGEMMLowpSource_FOUND)
+  return()
+endif(NOT TensorFlowGEMMLowpSource_FOUND)
+
+add_library(mio_tflite260_inc INTERFACE)
+target_include_directories(mio_tflite260_inc SYSTEM INTERFACE "${TensorFlowSource_DIR}")
+target_include_directories(mio_tflite260_inc SYSTEM INTERFACE "${TensorFlowGEMMLowpSource_DIR}")
diff --git a/compiler/mio-tflite260/README.md b/compiler/mio-tflite260/README.md
new file mode 100644
index 000000000..970569b47
--- /dev/null
+++ b/compiler/mio-tflite260/README.md
@@ -0,0 +1,3 @@
+# mio-tflite260
+
+_mio-tflite260_ provides a library to access TensorFlow lite model files with V2.6.0.
diff --git a/compiler/mio-tflite260/example.cpp b/compiler/mio-tflite260/example.cpp
new file mode 100644
index 000000000..2787a3c2d
--- /dev/null
+++ b/compiler/mio-tflite260/example.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//
+// This example shows how to include and use "mio-tflite260"
+//
+#include <mio/tflite/schema_generated.h>
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+int main(int argc, char **argv)
+{
+  std::ifstream ifs(argv[1], std::ios_base::binary);
+  std::vector<char> buf(std::istreambuf_iterator<char>{ifs}, std::istreambuf_iterator<char>{});
+
+  flatbuffers::Verifier verifier{reinterpret_cast<uint8_t *>(buf.data()), buf.size()};
+
+  if (!tflite::VerifyModelBuffer(verifier))
+  {
+    std::cout << "Fail" << std::endl;
+    return 255;
+  }
+
+  std::cout << "Pass" << std::endl;
+  return 0;
+}
diff --git a/compiler/mir/src/mir_tflite_importer/CMakeLists.txt b/compiler/mir/src/mir_tflite_importer/CMakeLists.txt
index 952857c86..42eb4f8a5 100644
--- a/compiler/mir/src/mir_tflite_importer/CMakeLists.txt
+++ b/compiler/mir/src/mir_tflite_importer/CMakeLists.txt
@@ -1,4 +1,4 @@
-nnas_find_package(FlatBuffers REQUIRED)
+nnas_find_package(FlatBuffers EXACT 1.10 REQUIRED)
 
 if (NOT FlatBuffers_FOUND)
     return()
diff --git a/compiler/one-cmds/CMakeLists.txt b/compiler/one-cmds/CMakeLists.txt
index fc89f4da5..729bfa80a 100644
--- a/compiler/one-cmds/CMakeLists.txt
+++ b/compiler/one-cmds/CMakeLists.txt
@@ -41,7 +41,6 @@ set(ONE_UTILITY_FILES
     one-build.template.cfg
     onecc.template.cfg
     utils.py
-    conv_mixin_1.8.0.patch
 )
 
 foreach(ONE_UTILITY IN ITEMS ${ONE_UTILITY_FILES})
diff --git a/compiler/one-cmds/conv_mixin_1.8.0.patch b/compiler/one-cmds/conv_mixin_1.8.0.patch
deleted file mode 100644
index 96a0f41cf..000000000
--- a/compiler/one-cmds/conv_mixin_1.8.0.patch
+++ /dev/null
@@ -1,11 +0,0 @@
---- a/onnx_tf/handlers/backend/conv_mixin.py
-+++ b/onnx_tf/handlers/backend/conv_mixin.py
-@@ -98,7 +98,7 @@
-     depthwise = (x_rank == 4 and len(weight_shape) == 4 and group != 1 and
-                  not transpose and not (None in weight_shape))
-     if depthwise and isinstance(x_shape, np.ndarray):
--      depthwise = group == x_shape[1]
-+      depthwise = bool(group == x_shape[1])
- 
-     if depthwise is True:
-       # Depthwise convolution.
diff --git a/compiler/one-cmds/how-to-use-one-commands.txt b/compiler/one-cmds/how-to-use-one-commands.txt
index f86709489..0a0c4b14c 100644
--- a/compiler/one-cmds/how-to-use-one-commands.txt
+++ b/compiler/one-cmds/how-to-use-one-commands.txt
@@ -150,11 +150,14 @@ one-optimize provides network or operator transformation shown below.
 
 Current transformation options are
 - disable_validation : This will turn off operator validations.
+- expand_broadcast_const : This will expand broadcastable constant node inputs
 - fold_add_v2 : This removes AddV2 operation which can be folded
 - fold_cast : This removes Cast operation which can be folded
 - fold_dequantize : This removes Dequantize operation which can be folded
+- fold_dwconv : This folds Depthwise Convolution operation which can be folded
 - fold_sparse_to_dense : This removes SparseToDense operation which can be folded
 - forward_reshape_to_unaryop: This will move Reshape after UnaryOp for centain condition
+- fuse_add_with_fully_connected: This fuses Add operator with the preceding FullyConnected operator if possible
 - fuse_add_with_tconv: This fuses Add operator with the preceding TConv operator if possible
 - fuse_batchnorm_with_conv : This fuses BatchNorm operator to convolution operator
 - fuse_batchnorm_with_dwconv : This fuses BatchNorm operator to depthwise convolution operator
@@ -192,6 +195,8 @@ Current transformation options are
 - shuffle_weight_to_16x1float32 : This will convert weight format of FullyConnected to SHUFFLED16x1FLOAT32.
   Note that it only converts weights whose row is a multiple of 16.
 - substitute_pack_to_reshape : This will convert single input Pack to Reshape.
+- substitute_padv2_to_pad : This will convert certain condition PadV2 to Pad.
+- substitute_splitv_to_split : This will convert certain condition SplitV to Split.
 - substitute_squeeze_to_reshape : This will convert certain condition Squeeze to Reshape.
 - substitute_strided_slice_to_reshape : This will convert certain condition StridedSlice to Reshape.
 - substitute_transpose_to_reshape : This will convert certain condition Transpose to Reshape.
diff --git a/compiler/one-cmds/one-codegen b/compiler/one-cmds/one-codegen
index a496a54ec..726538d44 100644
--- a/compiler/one-cmds/one-codegen
+++ b/compiler/one-cmds/one-codegen
@@ -28,6 +28,7 @@ import os
 import subprocess
 import sys
 import tempfile
+import shutil
 
 import utils as _utils
 
@@ -49,6 +50,7 @@ def _get_backends_list():
     The list where `one-codegen` finds its backends
     - `bin` folder where `one-codegen` exists
     - `backends` folder
+    - System path
 
     NOTE If there are backends of the same name in different places,
      the closer to the top in the list, the higher the priority.
@@ -151,6 +153,10 @@ def main():
         if ntpath.basename(cand) == backend_base:
             codegen_path = cand
     if not codegen_path:
+        # Find backend from system path
+        codegen_path = shutil.which(backend_base)
+
+    if not codegen_path:
         raise FileNotFoundError(backend_base + ' not found')
     codegen_cmd = [codegen_path] + backend_args + unknown_args
     if _utils._is_valid_attr(args, 'command'):
diff --git a/compiler/one-cmds/one-prepare-venv b/compiler/one-cmds/one-prepare-venv
index fbc3a75de..285191761 100644
--- a/compiler/one-cmds/one-prepare-venv
+++ b/compiler/one-cmds/one-prepare-venv
@@ -34,8 +34,8 @@ fi
 # - https://github.com/onnx/onnx-tensorflow/blob/master/Versioning.md
 
 VER_TENSORFLOW=2.3.0
-VER_ONNX=1.8.0
-VER_ONNX_TF=1.8.0
+VER_ONNX=1.10.1
+VER_ONNX_TF=1.9.0
 
 # Install tensorflow
 
@@ -61,7 +61,7 @@ ${VENV_PYTHON} -m pip ${PIP_OPTIONS} install tensorflow-cpu==${VER_TENSORFLOW}
 ${VENV_PYTHON} -m pip ${PIP_OPTIONS} install Pillow==6.2.2
 
 # Install PyTorch and ONNX related
-${VENV_PYTHON} -m pip ${PIP_OPTIONS} install torch==1.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+${VENV_PYTHON} -m pip ${PIP_OPTIONS} install torch==1.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
 
 # Provide install of custom onnx-tf
 if [ -n "${EXT_ONNX_TF_WHL}" ]; then
@@ -69,23 +69,3 @@ if [ -n "${EXT_ONNX_TF_WHL}" ]; then
 else
   ${VENV_PYTHON} -m pip ${PIP_OPTIONS} install onnx==${VER_ONNX} onnx-tf==${VER_ONNX_TF}
 fi
-
-# TODO remove this patch after onnx-tf next release
-# apply patch for DWConv conversion bug: https://github.com/onnx/onnx-tensorflow/pull/905
-if [[ -z "${EXT_ONNX_TF_WHL}" ]]; then
-  PY_SITE_PACKAGES=$(${VENV_PYTHON} -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')
-  if [[ -d ${PY_SITE_PACKAGES} ]]; then
-    pushd ${PY_SITE_PACKAGES} > /dev/null
-    PATCH_TARGET_FILE=onnx_tf/handlers/backend/conv_mixin.py
-    if [[ -f "${PATCH_TARGET_FILE}" ]]; then
-      # if patch is already applied, error code is 1
-      # catch error code and check if this is the case
-      set +e
-      patch -t -N -p1 < ${DRIVER_PATH}/conv_mixin_1.8.0.patch
-      ret_code=$?
-      [[ $ret_code -gt 1 ]] && exit $ret_code
-      set -e
-    fi
-    popd > /dev/null
-  fi
-fi
diff --git a/compiler/one-cmds/one-profile b/compiler/one-cmds/one-profile
index 798cc756c..ed6d8bd7a 100644
--- a/compiler/one-cmds/one-profile
+++ b/compiler/one-cmds/one-profile
@@ -157,14 +157,7 @@ def main():
         profile_cmd += getattr(args, 'command').split()
 
     # run backend driver
-    with subprocess.Popen(
-            profile_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
-            bufsize=1) as p:
-        for line in p.stdout:
-            sys.stdout.buffer.write(line)
-            sys.stdout.buffer.flush()
-    if p.returncode != 0:
-        sys.exit(p.returncode)
+    _utils._run(profile_cmd, err_prefix=backend_base)
 
 
 if __name__ == '__main__':
diff --git a/compiler/one-cmds/one-quantize b/compiler/one-cmds/one-quantize
index 25ef17ab1..cd623a6f8 100644
--- a/compiler/one-cmds/one-quantize
+++ b/compiler/one-cmds/one-quantize
@@ -88,6 +88,17 @@ def _get_parser():
         type=str,
         help='record mode (supported: percentile/moving_average, default=percentile)')
 
+    # arguments for force_quantparam
+    parser.add_argument(
+        '--force_quantparam',
+        action='store_true',
+        help='write quantparam to the specified tensor')
+    parser.add_argument(
+        '--tensor_name', type=str, action='append', help='tensor name (string)')
+    parser.add_argument('--scale', type=float, action='append', help='scale (float)')
+    parser.add_argument(
+        '--zero_point', type=int, action='append', help='zero point (int)')
+
     return parser
 
 
@@ -114,8 +125,22 @@ def _verify_arg(parser, args):
         missing.append('-i/--input_path')
     if not _utils._is_valid_attr(args, 'output_path'):
         missing.append('-o/--output_path')
+    if _utils._is_valid_attr(args, 'force_quantparam'):
+        if not _utils._is_valid_attr(args, 'tensor_name'):
+            missing.append('--tensor_name')
+        if not _utils._is_valid_attr(args, 'scale'):
+            missing.append('--scale')
+        if not _utils._is_valid_attr(args, 'zero_point'):
+            missing.append('--zero_point')
     if len(missing):
         parser.error('the following arguments are required: ' + ' '.join(missing))
+    if _utils._is_valid_attr(args, 'force_quantparam'):
+        tensors = getattr(args, 'tensor_name')
+        scales = getattr(args, 'scale')
+        zerops = getattr(args, 'zero_point')
+        if len(tensors) != len(scales) or len(tensors) != len(zerops):
+            parser.error(
+                'The same number of tensor_name, scale, and zero_point should be given.')
 
 
 def _parse_arg(parser):
@@ -128,6 +153,11 @@ def _parse_arg(parser):
 
 
 def _quantize(args):
+    if _utils._is_valid_attr(args, 'force_quantparam'):
+        # write quantization parameters
+        _write_qparam(args)
+        return
+
     # get file path to log
     dir_path = os.path.dirname(os.path.realpath(__file__))
     logfile_path = os.path.realpath(args.output_path) + '.log'
@@ -233,6 +263,43 @@ def _quantize(args):
         _utils._run(circle_quantizer_cmd, err_prefix="circle_quantizer", logfile=f)
 
 
+def _write_qparam(args):
+    # get file path to log
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+    logfile_path = os.path.realpath(args.output_path) + '.log'
+
+    with open(logfile_path, 'wb') as f:
+        # get driver path
+        circle_quantizer_path = os.path.join(dir_path, 'circle-quantizer')
+
+        # make a command to write qparams to the tensors
+        circle_quantizer_cmd = [circle_quantizer_path]
+        # verbose
+        if _utils._is_valid_attr(args, 'verbose'):
+            circle_quantizer_cmd.append('--verbose')
+        if _utils._is_valid_attr(args, 'tensor_name'):
+            tensor_name = getattr(args, 'tensor_name')
+        if _utils._is_valid_attr(args, 'scale'):
+            scale = getattr(args, 'scale')
+        if _utils._is_valid_attr(args, 'zero_point'):
+            zero_point = getattr(args, 'zero_point')
+        for (t, s, zp) in zip(tensor_name, scale, zero_point):
+            circle_quantizer_cmd.append('--force_quantparam')
+            circle_quantizer_cmd.append(t)
+            circle_quantizer_cmd.append(str(s))
+            circle_quantizer_cmd.append(str(zp))
+        # input and output path
+        if _utils._is_valid_attr(args, 'input_path'):
+            circle_quantizer_cmd.append(getattr(args, 'input_path'))
+        if _utils._is_valid_attr(args, 'output_path'):
+            circle_quantizer_cmd.append(getattr(args, 'output_path'))
+
+        f.write((' '.join(circle_quantizer_cmd) + '\n').encode())
+
+        # run circle-quantizer
+        _utils._run(circle_quantizer_cmd, err_prefix="circle_quantizer", logfile=f)
+
+
 def main():
     # parse arguments
     parser = _get_parser()
diff --git a/compiler/one-cmds/tests/one-import_neg_002.test b/compiler/one-cmds/tests/one-import_neg_002.test
index 738c2cba9..9cf0b1401 100644
--- a/compiler/one-cmds/tests/one-import_neg_002.test
+++ b/compiler/one-cmds/tests/one-import_neg_002.test
@@ -21,10 +21,16 @@ filename="${filename_ext%.*}"
 
 trap_err_onexit()
 {
+  # TF2.3.0
   if grep -q "is incompatible with result type" "${filename}.log"; then
     echo "${filename_ext} SUCCESS"
     exit 0
   fi
+  # TF2.6.0
+  if grep -q "is incompatible with body result type" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
 
   echo "${filename_ext} FAILED"
   exit 255
diff --git a/compiler/one-cmds/tests/one-import_neg_006.test b/compiler/one-cmds/tests/one-import_neg_006.test
index 7c63ee3e4..3fb5c7df1 100644
--- a/compiler/one-cmds/tests/one-import_neg_006.test
+++ b/compiler/one-cmds/tests/one-import_neg_006.test
@@ -45,5 +45,8 @@ one-import tf \
 --input_arrays input --input_shapes "0,299,299,3" \
 --output_arrays InceptionV3/Predictions/Reshape_1 > ${filename}.log 2>&1
 
-echo "${filename_ext} FAILED"
-exit 255
+# NOTE TF2.3.0 fails(which is expected) but doesn't for TF2.5(4?) and above
+# https://github.com/tensorflow/tensorflow/issues/51756 for details
+# TODO exit 255
+echo "${filename_ext} SKIPPED"
+exit 0
diff --git a/compiler/one-cmds/tests/one-quantize_005.test b/compiler/one-cmds/tests/one-quantize_005.test
new file mode 100644
index 000000000..8449df6ae
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_005.test
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.mat.q8.circle"
+outputfile="./inception_v3.one-quantize_005.q8.circle"
+
+rm -rf ${outputfile}
+
+# run test with force_quantparam option
+one-quantize \
+--force_quantparam \
+--tensor_name input \
+--scale 2.3 \
+--zero_point 33 \
+--input_path ${inputfile} \
+--output_path ${outputfile} > /dev/null 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-quantize_006.test b/compiler/one-cmds/tests/one-quantize_006.test
new file mode 100644
index 000000000..92b9ebebb
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_006.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.mat.q8.circle"
+outputfile="./inception_v3.one-quantize_006.q8.circle"
+
+rm -rf ${outputfile}
+
+# run test with force_quantparam option (multi tensors)
+one-quantize \
+--force_quantparam \
+--tensor_name input \
+--scale 2.3 \
+--zero_point 33 \
+--tensor_name InceptionV3/Predictions/Reshape_1 \
+--scale 2.3 \
+--zero_point 33 \
+--input_path ${inputfile} \
+--output_path ${outputfile} > /dev/null 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-quantize_neg_018.test b/compiler/one-cmds/tests/one-quantize_neg_018.test
new file mode 100644
index 000000000..6937caf4d
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_neg_018.test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with invalid min_percentile
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  if grep -q "following arguments are required: --zero_point" "${filename}.log"; then
+    echo "${filename_ext} SUCCESS"
+    exit 0
+  fi
+
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.mat.q8.circle"
+outputfile="./inception_v3.neg_018.q8.circle"
+
+rm -rf ${outputfile}.log
+
+# run test
+one-quantize \
+--force_quantparam \
+--tensor_name input \
+--scale 2.3 \
+--input_path ${inputfile} \
+--output_path ${outputfile} > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/onecc_022.cfg b/compiler/one-cmds/tests/onecc_022.cfg
new file mode 100644
index 000000000..9741d5173
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_022.cfg
@@ -0,0 +1,18 @@
+[onecc]
+one-import-tf=False
+one-import-tflite=False
+one-import-bcq=False
+one-import-onnx=False
+one-optimize=False
+one-quantize=True
+one-pack=False
+one-codegen=False
+one-profile=False
+
+[one-quantize]
+input_path=inception_v3.mat.q8.circle
+output_path=inception_v3.onecc_022.q8.circle
+force_quantparam=True
+tensor_name=input
+scale=2.1
+zero_point=45
diff --git a/compiler/one-cmds/tests/onecc_022.test b/compiler/one-cmds/tests/onecc_022.test
new file mode 100644
index 000000000..3aaa26fea
--- /dev/null
+++ b/compiler/one-cmds/tests/onecc_022.test
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# one-quantize
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+  echo "${filename_ext} FAILED"
+  exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="onecc_022.cfg"
+outputfile="inception_v3.onecc_022.q8.circle"
+
+rm -rf ${outputfile}
+
+# run test
+onecc -C ${configfile} > /dev/null 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+  trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/prepare_test_materials.sh b/compiler/one-cmds/tests/prepare_test_materials.sh
index 694651d74..7f269530c 100644
--- a/compiler/one-cmds/tests/prepare_test_materials.sh
+++ b/compiler/one-cmds/tests/prepare_test_materials.sh
@@ -103,4 +103,14 @@ if [[ ! -s ${outputfile} ]]; then
   --output_arrays InceptionV3/Predictions/Reshape_1
 fi
 
+# prepare 'inception_v3.mat.q8.circle' file used for quantization test
+inputfile="./inception_v3.circle"
+outputfile="./inception_v3.mat.q8.circle"
+
+if [[ ! -s ${outputfile} ]]; then
+  ../bin/one-quantize \
+  --input_path ${inputfile} \
+  --output_path ${outputfile}
+fi
+
 popd > /dev/null
diff --git a/compiler/one-cmds/utils.py b/compiler/one-cmds/utils.py
index f18dc6f56..efb01a210 100644
--- a/compiler/one-cmds/utils.py
+++ b/compiler/one-cmds/utils.py
@@ -29,6 +29,7 @@ class _CONSTANT:
         ('convert_nchw_to_nhwc',
          'Experimental: This will convert NCHW operators to NHWC under the assumption that input model is NCHW.'
          ),
+        ('expand_broadcast_const', 'expand broadcastable constant node inputs'),
         ('nchw_to_nhwc_input_shape',
          'convert the input shape of the model (argument for convert_nchw_to_nhwc)'),
         ('nchw_to_nhwc_output_shape',
@@ -36,9 +37,11 @@ class _CONSTANT:
         ('fold_add_v2', 'fold AddV2 op with constant inputs'),
         ('fold_cast', 'fold Cast op with constant input'),
         ('fold_dequantize', 'fold Dequantize op'),
+        ('fold_dwconv', 'fold Depthwise Convolution op with constant inputs'),
         ('fold_sparse_to_dense', 'fold SparseToDense op'),
         ('forward_reshape_to_unaryop', 'Forward Reshape op'),
         ('fuse_add_with_tconv', 'fuse Add op to Transposed'),
+        ('fuse_add_with_fully_connected', 'fuse Add op to FullyConnected op'),
         ('fuse_batchnorm_with_conv', 'fuse BatchNorm op to Convolution op'),
         ('fuse_batchnorm_with_dwconv', 'fuse BatchNorm op to Depthwise Convolution op'),
         ('fuse_batchnorm_with_tconv', 'fuse BatchNorm op to Transposed Convolution op'),
@@ -74,6 +77,8 @@ class _CONSTANT:
          'convert weight format of FullyConnected op to SHUFFLED16x1FLOAT32.'
          ' Note that it only converts weights whose row is a multiple of 16'),
         ('substitute_pack_to_reshape', 'convert single input Pack op to Reshape op'),
+        ('substitute_padv2_to_pad', 'convert certain condition PadV2 to Pad'),
+        ('substitute_splitv_to_split', 'convert certain condition SplitV to Split'),
         ('substitute_squeeze_to_reshape', 'convert certain condition Squeeze to Reshape'),
         ('substitute_strided_slice_to_reshape',
          'convert certain condition StridedSlice to Reshape'),
@@ -107,6 +112,14 @@ def _add_default_arg(parser):
     parser.add_argument('-S', '--section', type=str, help=argparse.SUPPRESS)
 
 
+def is_accumulated_arg(arg, driver):
+    if driver == "one-quantize":
+        if arg == "tensor_name" or arg == "scale" or arg == "zero_point":
+            return True
+
+    return False
+
+
 def _is_valid_attr(args, attr):
     return hasattr(args, attr) and getattr(args, attr)
 
@@ -124,6 +137,12 @@ def _parse_cfg(args, driver_name):
                 raise AssertionError('configuration file must have \'' + driver_name +
                                      '\' section')
             for key in config[args.section]:
+                if is_accumulated_arg(key, driver_name):
+                    if not _is_valid_attr(args, key):
+                        setattr(args, key, [config[args.section][key]])
+                    else:
+                        getattr(args, key).append(config[args.section][key])
+                    continue
                 if not _is_valid_attr(args, key):
                     setattr(args, key, config[args.section][key])
         # if section is not given, section name is same with its driver name
@@ -133,6 +152,12 @@ def _parse_cfg(args, driver_name):
                                      '\' section')
             secton_to_run = driver_name
             for key in config[secton_to_run]:
+                if is_accumulated_arg(key, driver_name):
+                    if not _is_valid_attr(args, key):
+                        setattr(args, key, [config[secton_to_run][key]])
+                    else:
+                        getattr(args, key).append(config[secton_to_run][key])
+                    continue
                 if not _is_valid_attr(args, key):
                     setattr(args, key, config[secton_to_run][key])
 
@@ -242,33 +267,26 @@ def _run(cmd, err_prefix=None, logfile=None):
         err_prefix: prefix to be put before every stderr lines
         logfile: file stream to which both of stdout and stderr lines will be written
     """
-    if logfile == None:
-        with subprocess.Popen(cmd, stderr=subprocess.PIPE, bufsize=1) as p:
-            for line in p.stderr:
-                if err_prefix:
-                    line = f"{err_prefix}: ".encode() + line
-                sys.stderr.buffer.write(line)
-                sys.stderr.buffer.flush()
-    else:
-        with subprocess.Popen(
-                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1) as p:
-            import select
-            inputs = set([p.stdout, p.stderr])
-            while inputs:
-                readable, _, _ = select.select(inputs, [], [])
-                for x in readable:
-                    line = x.readline()
-                    if len(line) == 0:
-                        inputs.discard(x)
-                        continue
-                    if x == p.stdout:
-                        out = sys.stdout
-                    if x == p.stderr:
-                        out = sys.stderr
-                        if err_prefix:
-                            line = f"{err_prefix}: ".encode() + line
-                    out.buffer.write(line)
-                    out.buffer.flush()
+    with subprocess.Popen(
+            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1) as p:
+        import select
+        inputs = set([p.stdout, p.stderr])
+        while inputs:
+            readable, _, _ = select.select(inputs, [], [])
+            for x in readable:
+                line = x.readline()
+                if len(line) == 0:
+                    inputs.discard(x)
+                    continue
+                if x == p.stdout:
+                    out = sys.stdout
+                if x == p.stderr:
+                    out = sys.stderr
+                    if err_prefix:
+                        line = f"{err_prefix}: ".encode() + line
+                out.buffer.write(line)
+                out.buffer.flush()
+                if logfile != None:
                     logfile.write(line)
     if p.returncode != 0:
         sys.exit(p.returncode)
diff --git a/compiler/pota-quantization-value-test/CMakeLists.txt b/compiler/pota-quantization-value-test/CMakeLists.txt
index 80661e566..00ffb57de 100644
--- a/compiler/pota-quantization-value-test/CMakeLists.txt
+++ b/compiler/pota-quantization-value-test/CMakeLists.txt
@@ -1,7 +1,7 @@
 unset(QUANTIZATION_VALUE_TEST)
 unset(QUANTIZATION_VALUE_TEST_WITH_PARAM)
 
-nnas_find_package(FlatBuffers QUIET)
+nnas_find_package(FlatBuffers EXACT 1.10 QUIET)
 if(NOT FlatBuffers_FOUND)
   message(STATUS "Build pota-quantization-value-test: FAILED (missing FlatBuffers)")
   return()
@@ -25,7 +25,7 @@ get_target_property(SCHEMA_BIN_PATH mio_circle BINARY_DIR)
 configure_file("${CMAKE_CURRENT_SOURCE_DIR}/gen_h5_explicit_inputs.py"
                "${CMAKE_CURRENT_BINARY_DIR}/gen_h5_explicit_inputs.py" COPYONLY)
 
-set(VIRTUALENV "${NNCC_OVERLAY_DIR}/venv_2_3_0")
+set(VIRTUALENV "${NNCC_OVERLAY_DIR}/venv_2_6_0")
 
 ###
 ### Generate test.config
diff --git a/compiler/pota-quantization-value-test/requires.cmake b/compiler/pota-quantization-value-test/requires.cmake
index 883a925df..4eb7204e1 100644
--- a/compiler/pota-quantization-value-test/requires.cmake
+++ b/compiler/pota-quantization-value-test/requires.cmake
@@ -2,3 +2,4 @@ require("record-minmax")
 require("circle-quantizer")
 require("circle-tensordump")
 require("common-artifacts")
+require("mio-circle")
diff --git a/compiler/tfl-inspect/CMakeLists.txt b/compiler/tfl-inspect/CMakeLists.txt
index ba019865f..6ba55c357 100644
--- a/compiler/tfl-inspect/CMakeLists.txt
+++ b/compiler/tfl-inspect/CMakeLists.txt
@@ -10,5 +10,5 @@ add_executable(tfl-inspect ${DRIVER} ${SOURCES})
 target_include_directories(tfl-inspect PRIVATE src)
 target_link_libraries(tfl-inspect arser)
 target_link_libraries(tfl-inspect foder)
-target_link_libraries(tfl-inspect mio_tflite)
+target_link_libraries(tfl-inspect mio_tflite260)
 target_link_libraries(tfl-inspect safemain)
diff --git a/compiler/tfl-inspect/requires.cmake b/compiler/tfl-inspect/requires.cmake
index 25857ad2b..9a7477b81 100644
--- a/compiler/tfl-inspect/requires.cmake
+++ b/compiler/tfl-inspect/requires.cmake
@@ -1,4 +1,4 @@
 require("arser")
 require("foder")
-require("mio-tflite")
+require("mio-tflite260")
 require("safemain")
diff --git a/compiler/tfl-inspect/src/Reader.cpp b/compiler/tfl-inspect/src/Reader.cpp
index 5be289446..41a8396bb 100644
--- a/compiler/tfl-inspect/src/Reader.cpp
+++ b/compiler/tfl-inspect/src/Reader.cpp
@@ -16,21 +16,34 @@
 
 #include "Reader.h"
 
+#include <cassert>
 #include <sstream>
 #include <string>
 
 namespace tflinspect
 {
 
+// This will provide v3/v3a format neutral BuiltinOperator
+tflite::BuiltinOperator builtin_code_neutral(const tflite::OperatorCode *opcode)
+{
+  assert(opcode != nullptr);
+  int8_t dp_code = opcode->deprecated_builtin_code();
+  // 127 is max of int8_t which is upper bound of v3 builtin_code
+  // NOTE TensorFlow uses 'BuiltinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES' for 127
+  if (dp_code < 127 && dp_code >= 0)
+    return tflite::BuiltinOperator(dp_code);
+  return opcode->builtin_code();
+}
+
 bool is_valid(const tflite::OperatorCode *opcode)
 {
-  tflite::BuiltinOperator code = opcode->builtin_code();
+  tflite::BuiltinOperator code = builtin_code_neutral(opcode);
   return (tflite::BuiltinOperator_MIN <= code && code <= tflite::BuiltinOperator_MAX);
 }
 
 bool is_custom(const tflite::OperatorCode *opcode)
 {
-  tflite::BuiltinOperator code = opcode->builtin_code();
+  tflite::BuiltinOperator code = builtin_code_neutral(opcode);
   return (code == tflite::BuiltinOperator_CUSTOM);
 }
 
@@ -56,7 +69,7 @@ std::string opcode_name(const tflite::OperatorCode *opcode)
     return custom_op;
   }
 
-  tflite::BuiltinOperator code = opcode->builtin_code();
+  tflite::BuiltinOperator code = builtin_code_neutral(opcode);
   return tflite::EnumNameBuiltinOperator(code);
 }
 
@@ -122,7 +135,7 @@ tflite::BuiltinOperator Reader::builtin_code(const tflite::Operator *op) const
   assert(index < _op_codes.size());
   const tflite::OperatorCode *opcode = _op_codes.at(index);
 
-  return opcode->builtin_code();
+  return tflinspect::builtin_code_neutral(opcode);
 }
 
 std::string Reader::opcode_name(const tflite::Operator *op) const
diff --git a/compiler/tfl-inspect/src/Reader.h b/compiler/tfl-inspect/src/Reader.h
index e9e182a4b..91b7bb940 100644
--- a/compiler/tfl-inspect/src/Reader.h
+++ b/compiler/tfl-inspect/src/Reader.h
@@ -36,6 +36,7 @@ template <typename T> std::vector<T> as_index_vector(const flatbuffers::Vector<T
   return ret;
 }
 
+tflite::BuiltinOperator builtin_code_neutral(const tflite::OperatorCode *opcode);
 bool is_valid(const tflite::OperatorCode *opcode);
 bool is_custom(const tflite::OperatorCode *opcode);
 std::string opcode_name(const tflite::OperatorCode *opcode);
diff --git a/compiler/tfl-verify/CMakeLists.txt b/compiler/tfl-verify/CMakeLists.txt
index 4421a4660..a87d30c5e 100644
--- a/compiler/tfl-verify/CMakeLists.txt
+++ b/compiler/tfl-verify/CMakeLists.txt
@@ -8,6 +8,6 @@ add_executable(tfl-verify ${SOURCES})
 target_include_directories(tfl-verify PRIVATE src)
 target_link_libraries(tfl-verify arser)
 target_link_libraries(tfl-verify foder)
-target_link_libraries(tfl-verify mio_tflite)
+target_link_libraries(tfl-verify mio_tflite260)
 target_link_libraries(tfl-verify safemain)
 target_link_libraries(tfl-verify cwrap)
diff --git a/compiler/tfl-verify/requires.cmake b/compiler/tfl-verify/requires.cmake
index 79503f325..72803d890 100644
--- a/compiler/tfl-verify/requires.cmake
+++ b/compiler/tfl-verify/requires.cmake
@@ -1,5 +1,5 @@
 require("arser")
 require("foder")
-require("mio-tflite")
+require("mio-tflite260")
 require("safemain")
 require("cwrap")
diff --git a/compiler/tflchef/CMakeLists.txt b/compiler/tflchef/CMakeLists.txt
index ebc873342..ac7fe4b7c 100644
--- a/compiler/tflchef/CMakeLists.txt
+++ b/compiler/tflchef/CMakeLists.txt
@@ -5,10 +5,10 @@ if(NOT Protobuf_FOUND)
   return()
 endif(NOT Protobuf_FOUND)
 
-if(NOT TARGET mio_tflite)
-  message(STATUS "Build tflchef: FAILED (missing mio_tflite)")
+if(NOT TARGET mio_tflite260)
+  message(STATUS "Build tflchef: FAILED (missing mio_tflite260)")
   return()
-endif(NOT TARGET mio_tflite)
+endif(NOT TARGET mio_tflite260)
 
 # Recipe Parser
 add_subdirectory(proto)
diff --git a/compiler/tflchef/core/CMakeLists.txt b/compiler/tflchef/core/CMakeLists.txt
index 43f6b8b03..413b78b15 100644
--- a/compiler/tflchef/core/CMakeLists.txt
+++ b/compiler/tflchef/core/CMakeLists.txt
@@ -5,5 +5,5 @@ target_include_directories(tflchef_core PUBLIC include)
 target_include_directories(tflchef_core PRIVATE src)
 target_link_libraries(tflchef_core tflchef_proto)
 target_link_libraries(tflchef_core tflchef_log)
-target_link_libraries(tflchef_core mio_tflite)
+target_link_libraries(tflchef_core mio_tflite260)
 target_link_libraries(tflchef_core souschef)
diff --git a/compiler/tflchef/core/src/CustomOp/AddV2.cpp b/compiler/tflchef/core/src/CustomOp/AddV2.cpp
index dffd336cd..557c20bce 100644
--- a/compiler/tflchef/core/src/CustomOp/AddV2.cpp
+++ b/compiler/tflchef/core/src/CustomOp/AddV2.cpp
@@ -17,7 +17,7 @@
 
 #include "AddV2.h"
 
-#include "flatbuffers/flexbuffers.h"
+#include <flatbuffers/flexbuffers.h>
 
 flatbuffers::Offset<void> AddV2Chef::value(flatbuffers::FlatBufferBuilder &fbb) const
 {
diff --git a/compiler/tflchef/core/src/CustomOp/All.cpp b/compiler/tflchef/core/src/CustomOp/All.cpp
index b3ae821a4..bbef5ecaa 100644
--- a/compiler/tflchef/core/src/CustomOp/All.cpp
+++ b/compiler/tflchef/core/src/CustomOp/All.cpp
@@ -17,7 +17,7 @@
 
 #include "All.h"
 
-#include "flatbuffers/flexbuffers.h"
+#include <flatbuffers/flexbuffers.h>
 
 flatbuffers::Offset<void> AllChef::value(flatbuffers::FlatBufferBuilder &fbb) const
 {
diff --git a/compiler/tflchef/core/src/CustomOp/BatchMatMulV2.cpp b/compiler/tflchef/core/src/CustomOp/BatchMatMulV2.cpp
index 595f3b9bb..6d2c5b13b 100644
--- a/compiler/tflchef/core/src/CustomOp/BatchMatMulV2.cpp
+++ b/compiler/tflchef/core/src/CustomOp/BatchMatMulV2.cpp
@@ -17,7 +17,7 @@
 
 #include "BatchMatMulV2.h"
 
-#include "flatbuffers/flexbuffers.h"
+#include <flatbuffers/flexbuffers.h>
 
 flatbuffers::Offset<void> BatchMatMulV2Chef::value(flatbuffers::FlatBufferBuilder &fbb) const
 {
diff --git a/compiler/tflchef/core/src/CustomOp/BroadcastTo.cpp b/compiler/tflchef/core/src/CustomOp/BroadcastTo.cpp
index fc429e2f7..dd458b376 100644
--- a/compiler/tflchef/core/src/CustomOp/BroadcastTo.cpp
+++ b/compiler/tflchef/core/src/CustomOp/BroadcastTo.cpp
@@ -17,7 +17,7 @@
 
 #include "BroadcastTo.h"
 
-#include "flatbuffers/flexbuffers.h"
+#include <flatbuffers/flexbuffers.h>
 
 flatbuffers::Offset<void> BroadcastToChef::value(flatbuffers::FlatBufferBuilder &fbb) const
 {
diff --git a/compiler/tflchef/core/src/CustomOp/MatMul.cpp b/compiler/tflchef/core/src/CustomOp/MatMul.cpp
index ba34aa8db..e7c707d37 100644
--- a/compiler/tflchef/core/src/CustomOp/MatMul.cpp
+++ b/compiler/tflchef/core/src/CustomOp/MatMul.cpp
@@ -17,7 +17,7 @@
 
 #include "MatMul.h"
 
-#include "flatbuffers/flexbuffers.h"
+#include <flatbuffers/flexbuffers.h>
 
 flatbuffers::Offset<void> MatMulChef::value(flatbuffers::FlatBufferBuilder &fbb) const
 {
diff --git a/compiler/tflchef/core/src/CustomOp/MatrixBandPart.cpp b/compiler/tflchef/core/src/CustomOp/MatrixBandPart.cpp
index d12597edb..b25003227 100644
--- a/compiler/tflchef/core/src/CustomOp/MatrixBandPart.cpp
+++ b/compiler/tflchef/core/src/CustomOp/MatrixBandPart.cpp
@@ -17,7 +17,7 @@
 
 #include "MatrixBandPart.h"
 
-#include "flatbuffers/flexbuffers.h"
+#include <flatbuffers/flexbuffers.h>
 
 flatbuffers::Offset<void> MatrixBandPartChef::value(flatbuffers::FlatBufferBuilder &fbb) const
 {
diff --git a/compiler/tflchef/core/src/CustomOp/MaxPoolWithArgmax.cpp b/compiler/tflchef/core/src/CustomOp/MaxPoolWithArgmax.cpp
index 9dacf7bf6..290d3c2ca 100644
--- a/compiler/tflchef/core/src/CustomOp/MaxPoolWithArgmax.cpp
+++ b/compiler/tflchef/core/src/CustomOp/MaxPoolWithArgmax.cpp
@@ -17,7 +17,7 @@
 
 #include "MaxPoolWithArgmax.h"
 
-#include "flatbuffers/flexbuffers.h"
+#include <flatbuffers/flexbuffers.h>
 
 flatbuffers::Offset<void> MaxPoolWithArgmaxChef::value(flatbuffers::FlatBufferBuilder &fbb) const
 {
diff --git a/compiler/tflchef/core/src/ModelChef.cpp b/compiler/tflchef/core/src/ModelChef.cpp
index aba20dcbf..7028bd9ac 100644
--- a/compiler/tflchef/core/src/ModelChef.cpp
+++ b/compiler/tflchef/core/src/ModelChef.cpp
@@ -582,8 +582,11 @@ GeneratedModel cook(const ::tflchef::ModelRecipe &model_recipe)
   for (auto const &opcode : builtin_code_map)
   {
     tflite::OperatorCodeBuilder code_builder{*flatbuffer_builder};
-    code_builder.add_builtin_code(opcode.first);
+    // TODO support for opcode.first >= 127
+    assert(opcode.first < 127);
+    code_builder.add_deprecated_builtin_code(opcode.first);
     code_builder.add_version(opcode.second);
+    code_builder.add_builtin_code(opcode.first);
     auto code = code_builder.Finish();
     // Update OperatorCode vector
     code_vec.emplace_back(code);
@@ -597,8 +600,9 @@ GeneratedModel cook(const ::tflchef::ModelRecipe &model_recipe)
   {
     auto custom_code = flatbuffer_builder->CreateString(opcode);
     tflite::OperatorCodeBuilder code_builder{*flatbuffer_builder};
-    code_builder.add_builtin_code(tflite::BuiltinOperator_CUSTOM);
+    code_builder.add_deprecated_builtin_code(tflite::BuiltinOperator_CUSTOM);
     code_builder.add_custom_code(custom_code);
+    code_builder.add_builtin_code(tflite::BuiltinOperator_CUSTOM);
     auto code = code_builder.Finish();
     // Update OperatorCode vector
     code_vec.emplace_back(code);
diff --git a/compiler/tflchef/requires.cmake b/compiler/tflchef/requires.cmake
index 4c02174b5..78bfa2d07 100644
--- a/compiler/tflchef/requires.cmake
+++ b/compiler/tflchef/requires.cmake
@@ -1,7 +1,7 @@
 require("arser")
 require("nnkit")
 require("cwrap")
-require("mio-tflite")
+require("mio-tflite260")
 require("safemain")
 require("hermes")
 require("hermes-std")
diff --git a/compiler/tflchef/tflite/CMakeLists.txt b/compiler/tflchef/tflite/CMakeLists.txt
index ce8b8c463..3c4c3fff6 100644
--- a/compiler/tflchef/tflite/CMakeLists.txt
+++ b/compiler/tflchef/tflite/CMakeLists.txt
@@ -4,6 +4,6 @@ add_library(tflchef_tflite STATIC ${SOURCES})
 target_include_directories(tflchef_tflite PUBLIC include)
 target_include_directories(tflchef_tflite PRIVATE src)
 target_link_libraries(tflchef_tflite tflchef_proto)
-target_link_libraries(tflchef_tflite mio_tflite)
+target_link_libraries(tflchef_tflite mio_tflite260)
 target_link_libraries(tflchef_tflite cwrap)
 target_link_libraries(tflchef_tflite souschef)
diff --git a/compiler/tflchef/tflite/src/TFliteImport.cpp b/compiler/tflchef/tflite/src/TFliteImport.cpp
index 51d9b5ffa..1462ee7f4 100644
--- a/compiler/tflchef/tflite/src/TFliteImport.cpp
+++ b/compiler/tflchef/tflite/src/TFliteImport.cpp
@@ -38,15 +38,27 @@ const char *tensor_name(const tflite::Tensor *tensor)
   return kEmptyTensorName;
 }
 
+// This will provide v3/v3a format neutral BuiltinOperator
+tflite::BuiltinOperator builtin_code_neutral(const tflite::OperatorCode *opcode)
+{
+  assert(opcode != nullptr);
+  int8_t dp_code = opcode->deprecated_builtin_code();
+  // 127 is max of int8_t which is upper bound of v3 builtin_code
+  // NOTE TensorFlow uses 'BuiltinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES' for 127
+  if (dp_code < 127 && dp_code >= 0)
+    return tflite::BuiltinOperator(dp_code);
+  return opcode->builtin_code();
+}
+
 bool is_valid(const tflite::OperatorCode *opcode)
 {
-  tflite::BuiltinOperator code = opcode->builtin_code();
+  tflite::BuiltinOperator code = builtin_code_neutral(opcode);
   return (tflite::BuiltinOperator_MIN <= code && code <= tflite::BuiltinOperator_MAX);
 }
 
 bool is_custom(const tflite::OperatorCode *opcode)
 {
-  tflite::BuiltinOperator code = opcode->builtin_code();
+  tflite::BuiltinOperator code = builtin_code_neutral(opcode);
   return (code == tflite::BuiltinOperator_CUSTOM);
 }
 
@@ -92,7 +104,7 @@ tflite::BuiltinOperator TFliteImport::builtin_code(const tflite::Operator *op) c
   assert(index < _op_codes.size());
   const tflite::OperatorCode *opcode = _op_codes.at(index);
 
-  return opcode->builtin_code();
+  return builtin_code_neutral(opcode);
 }
 
 std::string TFliteImport::opcode_name(const tflite::Operator *op) const
@@ -116,7 +128,7 @@ std::string TFliteImport::opcode_name(const tflite::Operator *op) const
     return opcode->custom_code()->c_str();
   }
 
-  tflite::BuiltinOperator code = opcode->builtin_code();
+  tflite::BuiltinOperator code = builtin_code_neutral(opcode);
   return EnumNameBuiltinOperator(code);
 }
 
diff --git a/compiler/tflchef/tflite/src/TFliteImport.h b/compiler/tflchef/tflite/src/TFliteImport.h
index 9d0a642ab..43b5bbaff 100644
--- a/compiler/tflchef/tflite/src/TFliteImport.h
+++ b/compiler/tflchef/tflite/src/TFliteImport.h
@@ -36,6 +36,7 @@ using TFliteOperators_t = flatbuffers::Vector<flatbuffers::Offset<tflite::Operat
 
 const char *tensor_type(const tflite::Tensor *tensor);
 const char *tensor_name(const tflite::Tensor *tensor);
+tflite::BuiltinOperator builtin_code_neutral(const tflite::OperatorCode *opcode);
 bool is_valid(const tflite::OperatorCode *opcode);
 bool is_custom(const tflite::OperatorCode *opcode);
 
diff --git a/compiler/tfldump/CMakeLists.txt b/compiler/tfldump/CMakeLists.txt
index e6afcb6d2..83f7febad 100644
--- a/compiler/tfldump/CMakeLists.txt
+++ b/compiler/tfldump/CMakeLists.txt
@@ -1,7 +1,7 @@
-if(NOT TARGET mio_tflite)
-  message(STATUS "Build tfldump: FAILED (missing mio_tflite)")
+if(NOT TARGET mio_tflite260)
+  message(STATUS "Build tfldump: FAILED (missing mio_tflite260)")
   return()
-endif(NOT TARGET mio_tflite)
+endif(NOT TARGET mio_tflite260)
 
 set(DRIVER "driver/Driver.cpp")
 
@@ -10,6 +10,6 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
 add_executable(tfldump ${DRIVER} ${SOURCES})
 target_include_directories(tfldump PRIVATE include)
 target_link_libraries(tfldump arser)
-target_link_libraries(tfldump mio_tflite)
+target_link_libraries(tfldump mio_tflite260)
 target_link_libraries(tfldump safemain)
-target_link_libraries(tfldump flatbuffers)
+target_link_libraries(tfldump flatbuffers-1.12)
diff --git a/compiler/tfldump/requires.cmake b/compiler/tfldump/requires.cmake
index 2cdd3a391..d0f9cccba 100644
--- a/compiler/tfldump/requires.cmake
+++ b/compiler/tfldump/requires.cmake
@@ -1,3 +1,3 @@
 require("arser")
-require("mio-tflite")
+require("mio-tflite260")
 require("safemain")
diff --git a/compiler/tfldump/src/Dump.cpp b/compiler/tfldump/src/Dump.cpp
index 20e1343e6..7a480bc52 100644
--- a/compiler/tfldump/src/Dump.cpp
+++ b/compiler/tfldump/src/Dump.cpp
@@ -350,6 +350,7 @@ void dump_model(std::ostream &os, const tflite::Model *model)
   auto opcodes = reader.opcodes();
   auto buffers = reader.buffers();
   auto metadata = reader.metadata();
+  auto signaturedefs = reader.signaturedefs();
 
   // dump operator_codes
   os << "Operator Codes: [order] OpCodeName (OpCode Enum)" << std::endl;
@@ -357,11 +358,13 @@ void dump_model(std::ostream &os, const tflite::Model *model)
   for (auto opcode : opcodes)
   {
     tflite::BuiltinOperator op_code = opcode->builtin_code();
+    tflite::BuiltinOperator dp_code = tflite::BuiltinOperator(opcode->deprecated_builtin_code());
+
     auto op_name = tflread::opcode_name(opcode);
     auto op_version = opcode->version();
 
     os << "[" << opcode_index << "] " << op_name << " (code: " << op_code
-       << ", version: " << op_version << ")" << std::endl;
+       << ", dep_code: " << dp_code << ", version: " << op_version << ")" << std::endl;
 
     opcode_index++;
   }
@@ -389,7 +392,38 @@ void dump_model(std::ostream &os, const tflite::Model *model)
     os << "metadata : B(index) name" << std::endl;
     for (uint32_t i = 0; i < metadata->Length(); ++i)
     {
-      os << "B(" << metadata->Get(i)->buffer() << ") " << metadata->Get(i)->name()->c_str();
+      os << "B(" << metadata->Get(i)->buffer() << ") " << metadata->Get(i)->name()->c_str()
+         << std::endl;
+    }
+    os << std::endl;
+  }
+
+  // dump signaturedef
+  if (signaturedefs != nullptr)
+  {
+    os << "SignatureDef" << std::endl;
+    for (uint32_t i = 0; i < signaturedefs->Length(); ++i)
+    {
+      auto sign_i = signaturedefs->Get(i);
+      os << "S(" << i << ") " << sign_i->method_name()->c_str() << ", key("
+         << sign_i->key()->c_str() << "), sub_graph(" << sign_i->subgraph_index() << ")"
+         << std::endl;
+
+      auto inputs_i = sign_i->inputs();
+      for (uint32_t t = 0; t < inputs_i->Length(); ++t)
+      {
+        auto inputs_i_t = inputs_i->Get(t);
+        os << "    I T(" << t << ") " << inputs_i_t->name()->c_str() << ": "
+           << inputs_i_t->tensor_index() << std::endl;
+      }
+
+      auto outputs_i = sign_i->outputs();
+      for (uint32_t t = 0; t < outputs_i->Length(); ++t)
+      {
+        auto outputs_i_t = outputs_i->Get(t);
+        os << "    O T(" << t << ") " << outputs_i_t->name()->c_str() << ": "
+           << outputs_i_t->tensor_index() << std::endl;
+      }
     }
     os << std::endl;
   }
diff --git a/compiler/tfldump/src/Read.cpp b/compiler/tfldump/src/Read.cpp
index 856cc5699..8b3a96e83 100644
--- a/compiler/tfldump/src/Read.cpp
+++ b/compiler/tfldump/src/Read.cpp
@@ -22,15 +22,25 @@
 namespace tflread
 {
 
+// This will provide v3/v3a format neutral BuiltinOperator
+tflite::BuiltinOperator builtin_code_neutral(const tflite::OperatorCode *opcode)
+{
+  assert(opcode != nullptr);
+  int8_t dp_code = opcode->deprecated_builtin_code();
+  if (dp_code < 127 && dp_code >= 0)
+    return tflite::BuiltinOperator(dp_code);
+  return opcode->builtin_code();
+}
+
 bool is_valid(const tflite::OperatorCode *opcode)
 {
-  tflite::BuiltinOperator code = opcode->builtin_code();
+  tflite::BuiltinOperator code = builtin_code_neutral(opcode);
   return (tflite::BuiltinOperator_MIN <= code && code <= tflite::BuiltinOperator_MAX);
 }
 
 bool is_custom(const tflite::OperatorCode *opcode)
 {
-  tflite::BuiltinOperator code = opcode->builtin_code();
+  tflite::BuiltinOperator code = builtin_code_neutral(opcode);
   return (code == tflite::BuiltinOperator_CUSTOM);
 }
 
@@ -56,7 +66,7 @@ std::string opcode_name(const tflite::OperatorCode *opcode)
     return custom_op;
   }
 
-  tflite::BuiltinOperator code = opcode->builtin_code();
+  tflite::BuiltinOperator code = builtin_code_neutral(opcode);
   return tflite::EnumNameBuiltinOperator(code);
 }
 
@@ -82,6 +92,7 @@ Reader::Reader(const tflite::Model *model)
   _subgraphs = model->subgraphs();
   _buffers = model->buffers();
   _metadata = model->metadata();
+  _signaturedefs = model->signature_defs();
 
   auto opcodes = model->operator_codes();
   for (const ::tflite::OperatorCode *opcode : *opcodes)
@@ -118,7 +129,7 @@ tflite::BuiltinOperator Reader::builtin_code(const tflite::Operator *op) const
   assert(index < _op_codes.size());
   const tflite::OperatorCode *opcode = _op_codes.at(index);
 
-  return opcode->builtin_code();
+  return tflread::builtin_code_neutral(opcode);
 }
 
 std::string Reader::opcode_name(const tflite::Operator *op) const
diff --git a/compiler/tfldump/src/Read.h b/compiler/tfldump/src/Read.h
index f835be140..80f317d0b 100644
--- a/compiler/tfldump/src/Read.h
+++ b/compiler/tfldump/src/Read.h
@@ -36,6 +36,7 @@ template <typename T> std::vector<T> as_index_vector(const flatbuffers::Vector<T
   return ret;
 }
 
+tflite::BuiltinOperator builtin_code_neutral(const tflite::OperatorCode *opcode);
 bool is_valid(const tflite::OperatorCode *opcode);
 bool is_custom(const tflite::OperatorCode *opcode);
 std::string opcode_name(const tflite::OperatorCode *opcode);
@@ -53,6 +54,7 @@ private:
   using TFliteTensors_t = flatbuffers::Vector<flatbuffers::Offset<tflite::Tensor>>;
   using TFliteOperators_t = flatbuffers::Vector<flatbuffers::Offset<tflite::Operator>>;
   using TFliteMetadata_t = flatbuffers::Vector<flatbuffers::Offset<tflite::Metadata>>;
+  using TFliteSignatureDef_t = flatbuffers::Vector<flatbuffers::Offset<tflite::SignatureDef>>;
 
 public:
   Reader(const tflite::Model *model);
@@ -69,6 +71,7 @@ public:
   const std::vector<int32_t> &inputs() const { return _inputs; }
   const std::vector<int32_t> &outputs() const { return _outputs; }
   const TFliteMetadata_t *metadata() const { return _metadata; }
+  const TFliteSignatureDef_t *signaturedefs() const { return _signaturedefs; }
 
   uint32_t num_subgraph() const { return _subgraphs->Length(); }
 
@@ -89,6 +92,7 @@ private:
   const TFliteTensors_t *_tensors{nullptr};
   const TFliteOperators_t *_operators{nullptr};
   const TFliteMetadata_t *_metadata{nullptr};
+  const TFliteSignatureDef_t *_signaturedefs{nullptr};
 
   uint32_t _subgraph_index;
   std::string _subgraph_name;
diff --git a/compiler/tflite2circle/CMakeLists.txt b/compiler/tflite2circle/CMakeLists.txt
index 3e46dd803..4ea01ad31 100644
--- a/compiler/tflite2circle/CMakeLists.txt
+++ b/compiler/tflite2circle/CMakeLists.txt
@@ -1,7 +1,7 @@
 nnas_include(TargetRequire)
 
 unset(REQUIRED_TARGETS)
-list(APPEND REQUIRED_TARGETS mio_tflite)
+list(APPEND REQUIRED_TARGETS mio_tflite260)
 list(APPEND REQUIRED_TARGETS mio_circle)
 TargetRequire_Return(${REQUIRED_TARGETS})
 
@@ -11,8 +11,9 @@ add_executable(tflite2circle ${DRIVER} ${SOURCES})
 target_include_directories(tflite2circle PRIVATE include)
 target_include_directories(tflite2circle PRIVATE src)
 target_link_libraries(tflite2circle arser)
+target_link_libraries(tflite2circle foder)
 target_link_libraries(tflite2circle safemain)
-target_link_libraries(tflite2circle mio_tflite)
+target_link_libraries(tflite2circle mio_tflite260)
 target_link_libraries(tflite2circle mio_circle)
 target_link_libraries(tflite2circle vconone)
 target_link_libraries(tflite2circle nncc_coverage)
diff --git a/compiler/tflite2circle/driver/Driver.cpp b/compiler/tflite2circle/driver/Driver.cpp
index fc7ee4042..4015631ab 100644
--- a/compiler/tflite2circle/driver/Driver.cpp
+++ b/compiler/tflite2circle/driver/Driver.cpp
@@ -70,9 +70,9 @@ int entry(int argc, char **argv)
   std::string circle_path = arser.get<std::string>("circle");
   // read tflite file
   tflite2circle::TFLModel tfl_model(tfl_path);
-  if (!tfl_model.is_valid())
+  if (not tfl_model.verify_data())
   {
-    std::cerr << "ERROR: Failed to load tflite '" << tfl_path << "'" << std::endl;
+    std::cerr << "ERROR: Failed to verify tflite '" << tfl_path << "'" << std::endl;
     return 255;
   }
 
@@ -80,7 +80,7 @@ int entry(int argc, char **argv)
   auto flatbuffer_builder = std::make_unique<flatbuffers::FlatBufferBuilder>(1024);
 
   // convert tflite to circle
-  tflite2circle::CircleModel circle_model{flatbuffer_builder, tfl_model};
+  tflite2circle::CircleModel circle_model{flatbuffer_builder, tfl_model.get_model()};
 
   std::ofstream outfile{circle_path, std::ios::binary};
 
diff --git a/compiler/tflite2circle/include/CircleModel.h b/compiler/tflite2circle/include/CircleModel.h
index e1e35d8ff..14c4f1c12 100644
--- a/compiler/tflite2circle/include/CircleModel.h
+++ b/compiler/tflite2circle/include/CircleModel.h
@@ -63,12 +63,17 @@ private:
 
 public:
   Offset(void) = delete;
-  Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_flatbuffer_vec);
+  Offset(FlatBufBuilder &fb) : _fb{fb} {};
+
+public:
+  // TODO use _fb
+  void build(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_flatbuffer_vec);
 
 public:
   CIRFlatBufVecOffset offset(void) const { return _circle_flatbuffer_vec_offset; }
 
 private:
+  FlatBufBuilder &_fb;
   CIRFlatBufVecOffset _circle_flatbuffer_vec_offset;
 };
 
@@ -79,7 +84,7 @@ private:
 
 public:
   CircleModel(void) = delete;
-  CircleModel(FlatBufBuilder &fb, TFLModel &tfl_model);
+  CircleModel(FlatBufBuilder &fb, const tflite::Model *tfl_model);
 
 public:
   void model_build(void) const;
diff --git a/compiler/tflite2circle/include/TFLModel.h b/compiler/tflite2circle/include/TFLModel.h
index e53d62749..507667bb9 100644
--- a/compiler/tflite2circle/include/TFLModel.h
+++ b/compiler/tflite2circle/include/TFLModel.h
@@ -37,15 +37,14 @@ public:
   TFLModel(const std::string &path);
 
 public:
-  bool is_valid(void) { return _valid; }
+  const tflite::Model *get_model(void);
 
-private:
-  const tflite::Model *load_model(void);
+public:
+  bool verify_data(void);
 
 private:
   std::ifstream _infile;
   DataBuffer _data;
-  bool _valid;
 
   friend class CircleModel;
 };
diff --git a/compiler/tflite2circle/requires.cmake b/compiler/tflite2circle/requires.cmake
index 837c287b6..e39f9eeaf 100644
--- a/compiler/tflite2circle/requires.cmake
+++ b/compiler/tflite2circle/requires.cmake
@@ -1,5 +1,6 @@
 require("arser")
-require("mio-tflite")
+require("foder")
+require("mio-tflite260")
 require("mio-circle")
 require("safemain")
 require("vconone")
diff --git a/compiler/tflite2circle/src/CircleModel.cpp b/compiler/tflite2circle/src/CircleModel.cpp
index ab0b5b507..4249f1560 100644
--- a/compiler/tflite2circle/src/CircleModel.cpp
+++ b/compiler/tflite2circle/src/CircleModel.cpp
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <cassert>
 #include <iostream>
 #include <memory>
 
@@ -24,7 +25,8 @@ namespace tflite2circle
 {
 
 template <>
-Offset<MetaDataBufferLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_flatbuffer_vec)
+void Offset<MetaDataBufferLink>::build(FlatBufBuilder &fb,
+                                       const TFLFlatBufVec *tflite_flatbuffer_vec)
 {
   if (tflite_flatbuffer_vec == nullptr)
     return;
@@ -34,7 +36,7 @@ Offset<MetaDataBufferLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tfli
 }
 
 template <>
-Offset<BufferLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_flatbuffer_vec)
+void Offset<BufferLink>::build(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_flatbuffer_vec)
 {
   std::vector<flatbuffers::Offset<circle::Buffer>> buffers_vec;
 
@@ -55,7 +57,7 @@ Offset<BufferLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_flatb
 }
 
 template <>
-Offset<SubGraphLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_flatbuffer_vec)
+void Offset<SubGraphLink>::build(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_flatbuffer_vec)
 {
   std::vector<flatbuffers::Offset<circle::SubGraph>> subgprahs_vec;
 
@@ -278,8 +280,19 @@ Offset<SubGraphLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_fla
   _circle_flatbuffer_vec_offset = fb->CreateVector(subgprahs_vec);
 }
 
+tflite::BuiltinOperator builtin_code_neutral(const tflite::OperatorCode *opcode)
+{
+  assert(opcode != nullptr);
+  int8_t dp_code = opcode->deprecated_builtin_code();
+  // 127 is max of int8_t which is upper bound of v3 builtin_code
+  // NOTE TensorFlow uses 'BuiltinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES' for 127
+  if (dp_code < 127 && dp_code >= 0)
+    return tflite::BuiltinOperator(dp_code);
+  return opcode->builtin_code();
+}
+
 template <>
-Offset<OperatorCodeLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_flatbuffer_vec)
+void Offset<OperatorCodeLink>::build(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_flatbuffer_vec)
 {
   std::vector<flatbuffers::Offset<circle::OperatorCode>> operator_code_vec;
 
@@ -287,7 +300,9 @@ Offset<OperatorCodeLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite
   {
     auto custom_code = fb->CreateString(it->custom_code());
     circle::OperatorCodeBuilder operator_code_builder{*fb};
-    operator_code_builder.add_builtin_code(get_circle_builtin_code(it->builtin_code()));
+    // TODO support circle deprecated_builtin_code
+    auto bt_code = builtin_code_neutral(it);
+    operator_code_builder.add_builtin_code(get_circle_builtin_code(bt_code));
     operator_code_builder.add_custom_code(custom_code);
     operator_code_builder.add_version(it->version());
     auto code = operator_code_builder.Finish();
@@ -296,24 +311,19 @@ Offset<OperatorCodeLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite
   _circle_flatbuffer_vec_offset = fb->CreateVector(operator_code_vec);
 }
 
-CircleModel::CircleModel(FlatBufBuilder &fb, TFLModel &model)
-  : _version{0}, _description{fb->CreateString("nnpackage")}, _fb{fb}
+CircleModel::CircleModel(FlatBufBuilder &fb, const tflite::Model *tfl_model)
+  : _version{0}, _description{fb->CreateString("ONE-tflite2circle")}, _fb{fb}
 {
-  const tflite::Model *tfl_model = model.load_model();
-  // verify flatbuffers
-  flatbuffers::Verifier verifier{reinterpret_cast<const uint8_t *>(model._data.data()),
-                                 model._data.size()};
-  if (!tflite::VerifyModelBuffer(verifier))
-  {
-    throw std::runtime_error("Failed to verify tflite");
-  }
+  _operator_codes_offset = std::make_unique<Offset<OperatorCodeLink>>(fb);
+  _subGraphs_offset = std::make_unique<Offset<SubGraphLink>>(fb);
+  _buffers_offset = std::make_unique<Offset<BufferLink>>(fb);
+  _metadata_buffer_offset = std::make_unique<Offset<MetaDataBufferLink>>(fb);
+
+  _operator_codes_offset->build(fb, tfl_model->operator_codes());
+  _subGraphs_offset->build(fb, tfl_model->subgraphs());
+  _buffers_offset->build(fb, tfl_model->buffers());
+  _metadata_buffer_offset->build(fb, tfl_model->metadata_buffer());
 
-  _operator_codes_offset =
-    std::make_unique<Offset<OperatorCodeLink>>(fb, tfl_model->operator_codes());
-  _subGraphs_offset = std::make_unique<Offset<SubGraphLink>>(fb, tfl_model->subgraphs());
-  _buffers_offset = std::make_unique<Offset<BufferLink>>(fb, tfl_model->buffers());
-  _metadata_buffer_offset =
-    std::make_unique<Offset<MetaDataBufferLink>>(fb, tfl_model->metadata_buffer());
   model_build();
 }
 
diff --git a/compiler/tflite2circle/src/TFLModel.cpp b/compiler/tflite2circle/src/TFLModel.cpp
index 33f11fb83..470b1aec7 100644
--- a/compiler/tflite2circle/src/TFLModel.cpp
+++ b/compiler/tflite2circle/src/TFLModel.cpp
@@ -16,6 +16,8 @@
 
 #include <iostream>
 
+#include <foder/FileLoader.h>
+
 #include "TFLModel.h"
 
 namespace tflite2circle
@@ -23,21 +25,21 @@ namespace tflite2circle
 
 TFLModel::TFLModel(const std::string &path)
 {
-  _infile.open(path, std::ios::binary | std::ios::in);
-  _valid = _infile.good();
+  foder::FileLoader file_loader{path};
+  _data = file_loader.load();
 }
 
-const tflite::Model *TFLModel::load_model(void)
+bool TFLModel::verify_data(void)
 {
-  assert(_valid == true);
-  _infile.seekg(0, std::ios::end);
-  auto fileSize = _infile.tellg();
-  _infile.seekg(0, std::ios::beg);
-  _data.resize(fileSize);
-  _infile.read(_data.data(), fileSize);
-  _infile.close();
-
-  return tflite::GetModel(_data.data());
+  // verify flatbuffers
+  flatbuffers::Verifier verifier{reinterpret_cast<const uint8_t *>(_data.data()), _data.size()};
+  if (not tflite::VerifyModelBuffer(verifier))
+  {
+    return false;
+  }
+  return true;
 }
 
+const tflite::Model *TFLModel::get_model(void) { return tflite::GetModel(_data.data()); }
+
 } // namespace tflite2circle
diff --git a/compiler/vconone/CMakeLists.txt b/compiler/vconone/CMakeLists.txt
index 1cf7c0c45..50ee05242 100644
--- a/compiler/vconone/CMakeLists.txt
+++ b/compiler/vconone/CMakeLists.txt
@@ -1,5 +1,5 @@
 if (NOT VCONONE_VERSION)
-  set(VCONONE_VERSION 0x0000000100110000)
+  set(VCONONE_VERSION 0x0000000000120001)
   # NOTE order is [build patch minor major]
   # if VCONONE_VERSION is set with -D option, it will be cached
   # you may have to remove cache file if you remove -D option
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 1a180a35b..e15dc2685 100644
--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -83,10 +83,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
   {"topkv2_find_first_negative", "topkv2.cl"},
   {"topkv2_reorder_negatives", "topkv2.cl"},
   {"topkv2_store", "topkv2.cl"},
-  {"radixsort_histogram", "topkv2_radixsort.cl"},
-  {"radixsort_scanhistograms", "topkv2_radixsort.cl"},
-  {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
-  {"radixsort_reorder", "topkv2_radixsort.cl"},
   {"topkv2_quicksort", "topkv2_quicksort.cl"},
   {"scale_factor_symm8", "scale_factor.cl"},
 };
@@ -186,10 +182,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
 #include "./cl_kernels/topkv2.clembed"
   },
   {
-    "topkv2_radixsort.cl",
-#include "./cl_kernels/topkv2_radixsort.clembed"
-  },
-  {
     "topkv2_quicksort.cl",
 #include "./cl_kernels/topkv2_quicksort.clembed"
   },
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
deleted file mode 100644
index e9d4696b4..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
+++ /dev/null
@@ -1,292 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-// reference:
-// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
-// OpenCL kernel sources for the CLRadixSort class
-// the #include does not exist in OpenCL
-// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
-// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
-// if you find this software usefull you can cite the following work in your reports or articles:
-// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
-// http://hal.archives-ouvertes.fr/hal-00596730
-
-// Reference for floating point radix sort:
-// http://www.codercorner.com/RadixSortRevisited.htm
-
-// compute the histogram for each radix and each virtual processor for the pass
-__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms,
-                                  const int pass, __local int *loc_histo, const int n)
-{
-  int it = get_local_id(0);  // i local number of the processor
-  int ig = get_global_id(0); // global number = i + g I
-
-  int gr = get_group_id(0); // g group number
-
-  int groups = get_num_groups(0);
-  int items = get_local_size(0);
-
-  // set the local histograms to zero
-  for (int ir = 0; ir < _RADIX; ir++)
-  {
-    loc_histo[ir * items + it] = 0;
-  }
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // range of keys that are analyzed by the work item
-  int size = n / groups / items; // size of the sub-list
-  int start = ig * size;         // beginning of the sub-list
-
-  unsigned int key;
-  int shortkey, k;
-
-  // compute the index
-  // the computation depends on the transposition
-  for (int j = 0; j < size; j++)
-  {
-#ifdef TRANSPOSE
-    k = groups * items * j + ig;
-#else
-    k = j + start;
-#endif
-
-    key = *((__global unsigned int *)(in_key_buf + k));
-
-    // extract the group of _BITS bits of the pass
-    // the result is in the range 0.._RADIX-1
-    shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
-
-    // increment the local histogram
-    loc_histo[shortkey * items + it]++;
-  }
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // copy the local histogram to the global one
-  for (int ir = 0; ir < _RADIX; ir++)
-  {
-    d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
-  }
-
-  barrier(CLK_GLOBAL_MEM_FENCE);
-}
-
-// initial transpose of the list for improving
-// coalescent memory access
-__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol,
-                        const int nbrow, const __global int *inperm, __global int *outperm,
-                        __local int *blockmat, __local int *blockperm, const int tilesize)
-{
-
-  int i0 = get_global_id(0) * tilesize; // first row index
-  int j = get_global_id(1);             // column index
-
-  int jloc = get_local_id(1); // local column index
-
-  // fill the cache
-  for (int iloc = 0; iloc < tilesize; iloc++)
-  {
-    int k = (i0 + iloc) * nbcol + j; // position in the matrix
-    blockmat[iloc * tilesize + jloc] = invect[k];
-#ifdef PERMUT
-    blockperm[iloc * tilesize + jloc] = inperm[k];
-#endif
-  }
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // first row index in the transpose
-  int j0 = get_group_id(1) * tilesize;
-
-  // put the cache at the good place
-  for (int iloc = 0; iloc < tilesize; iloc++)
-  {
-    int kt = (j0 + iloc) * nbrow + i0 + jloc; // position in the transpose
-    outvect[kt] = blockmat[jloc * tilesize + iloc];
-#ifdef PERMUT
-    outperm[kt] = blockperm[jloc * tilesize + iloc];
-#endif
-  }
-}
-
-// each virtual processor reorders its data using the scanned histogram
-__kernel void radixsort_reorder(__global float *in_key, __global float *out_key,
-                                __global int *d_Histograms, const int pass,
-                                __global int *indices_in, __global int *indices_out,
-                                __local int *loc_histo, const int n)
-{
-
-  int it = get_local_id(0);
-  int ig = get_global_id(0);
-
-  int gr = get_group_id(0);
-  int groups = get_num_groups(0);
-  int items = get_local_size(0);
-
-  int start = ig * (n / groups / items);
-  int size = n / groups / items;
-
-  // take the histogram in the cache
-  for (int ir = 0; ir < _RADIX; ir++)
-  {
-    loc_histo[ir * items + it] = d_Histograms[items * (ir * groups + gr) + it];
-  }
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  int newpos, shortkey, k, newpost;
-  unsigned int key;
-
-  for (int j = 0; j < size; j++)
-  {
-#ifdef TRANSPOSE
-    k = groups * items * j + ig;
-#else
-    k = j + start;
-#endif
-    float org_value = in_key[k];
-    key = *(__global unsigned int *)(in_key + k);
-    shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
-
-    newpos = loc_histo[shortkey * items + it];
-
-#ifdef TRANSPOSE
-    int ignew, jnew;
-    ignew = newpos / (n / groups / items);
-    jnew = newpos % (n / groups / items);
-    newpost = jnew * (groups * items) + ignew;
-#else
-    newpost = newpos;
-#endif
-
-    // d_outKeys[newpost]= key;  // killing line !!!
-    out_key[newpost] = org_value;
-
-#ifdef PERMUT
-    indices_out[newpost] = indices_in[k];
-#endif
-
-    newpos++;
-    loc_histo[shortkey * items + it] = newpos;
-  }
-}
-
-// perform a parallel prefix sum (a scan) on the local histograms
-// (see Blelloch 1990) each workitem worries about two memories
-// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
-__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp,
-                                       __global int *globsum)
-{
-  int it = get_local_id(0);
-  int ig = get_global_id(0);
-  int decale = 1;
-  int n = get_local_size(0) * 2;
-  int gr = get_group_id(0);
-
-  // load input into local memory
-  // up sweep phase
-  temp[2 * it] = histo[2 * ig];
-  temp[2 * it + 1] = histo[2 * ig + 1];
-
-  // parallel prefix sum (algorithm of Blelloch 1990)
-  for (int d = n >> 1; d > 0; d >>= 1)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (it < d)
-    {
-      int ai = decale * (2 * it + 1) - 1;
-      int bi = decale * (2 * it + 2) - 1;
-      temp[bi] += temp[ai];
-    }
-    decale *= 2;
-  }
-
-  // store the last element in the global sum vector
-  // (maybe used in the next step for constructing the global scan)
-  // clear the last element
-  if (it == 0)
-  {
-    globsum[gr] = temp[n - 1];
-    temp[n - 1] = 0;
-  }
-
-  // down sweep phase
-  for (int d = 1; d < n; d *= 2)
-  {
-    decale >>= 1;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (it < d)
-    {
-      int ai = decale * (2 * it + 1) - 1;
-      int bi = decale * (2 * it + 2) - 1;
-
-      int t = temp[ai];
-      temp[ai] = temp[bi];
-      temp[bi] += t;
-    }
-  }
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // write results to device memory
-
-  histo[2 * ig] = temp[2 * it];
-  histo[2 * ig + 1] = temp[2 * it + 1];
-
-  barrier(CLK_GLOBAL_MEM_FENCE);
-}
-
-// use the global sum for updating the local histograms
-// each work item updates two values
-__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum)
-{
-  int ig = get_global_id(0);
-  int gr = get_group_id(0);
-
-  int s;
-
-  s = globsum[gr];
-
-  // write results to device memory
-  histo[2 * ig] += s;
-  histo[2 * ig + 1] += s;
-
-  barrier(CLK_GLOBAL_MEM_FENCE);
-}
diff --git a/docs/conf.py b/docs/conf.py
index ea17db054..b59cab878 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -21,7 +21,7 @@ copyright = '2020, Samsung Research & contributors'
 author = 'Samsung Research & contributors'
 
 # The full version, including alpha/beta/rc tags
-release = '1.17.0'
+release = '1.18.0'
 
 # -- General configuration ---------------------------------------------------
 
diff --git a/docs/release/1.18/index.rst b/docs/release/1.18/index.rst
new file mode 100644
index 000000000..71c46585a
--- /dev/null
+++ b/docs/release/1.18/index.rst
@@ -0,0 +1,13 @@
+.. ONE documentation master file, created by
+   sphinx-quickstart on Fri Oct 20 15:20:12 2021.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+1.18
+====
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+  ./release-note-1.18.0.md
diff --git a/docs/release/1.18/release-note-1.18.0.md b/docs/release/1.18/release-note-1.18.0.md
new file mode 100644
index 000000000..a10f10e37
--- /dev/null
+++ b/docs/release/1.18/release-note-1.18.0.md
@@ -0,0 +1,11 @@
+# Release Note 1.18.0
+
+## ONE Compiler
+
+### Compiler Frontend
+
+- More optimization pass
+  - Fold DepthwiseConv2D
+  - Substitute SplitV to Split
+  - Expand BroadCast Const
+  - Force QuantParam
diff --git a/infra/cmake/modules/ExternalSourceTools.cmake b/infra/cmake/modules/ExternalSourceTools.cmake
index 0bfbaa33b..c8ca57520 100644
--- a/infra/cmake/modules/ExternalSourceTools.cmake
+++ b/infra/cmake/modules/ExternalSourceTools.cmake
@@ -103,7 +103,13 @@ function(ExternalSource_Download PREFIX)
 
     message(STATUS "Extract ${PREFIX}")
     execute_process(COMMAND ${CMAKE_COMMAND} -E tar xfz "${DOWNLOAD_PATH}"
-                    WORKING_DIRECTORY "${TMP_DIR}")
+                    WORKING_DIRECTORY "${TMP_DIR}"
+                    ERROR_VARIABLE EXTRACTION_ERROR)
+
+    if(EXTRACTION_ERROR)
+      message(FATAL_ERROR "Extract ${PREFIX} - failed")
+    endif(EXTRACTION_ERROR)
+
     file(REMOVE "${DOWNLOAD_PATH}")
     message(STATUS "Extract ${PREFIX} - done")
 
diff --git a/infra/cmake/packages/CMSISSource-5.8.0/CMSISSourceConfig.cmake b/infra/cmake/packages/CMSISSource-5.8.0/CMSISSourceConfig.cmake
new file mode 100644
index 000000000..b48239f2a
--- /dev/null
+++ b/infra/cmake/packages/CMSISSource-5.8.0/CMSISSourceConfig.cmake
@@ -0,0 +1,13 @@
+function(_CMSISSource_import)
+  nnas_include(ExternalSourceTools)
+  nnas_include(OptionTools)
+
+  envoption(CMSIS_5_8_0_URL https://github.com/ARM-software/CMSIS_5/archive/refs/tags/5.8.0.tar.gz)
+
+  ExternalSource_Download(CMSIS DIRNAME CMSIS-5.8.0 ${CMSIS_5_8_0_URL})
+
+  set(CMSISSource_DIR ${CMSIS_SOURCE_DIR} PARENT_SCOPE)
+  set(CMSISSource_FOUND TRUE PARENT_SCOPE)
+endfunction(_CMSISSource_import)
+
+_CMSISSource_import()
diff --git a/infra/cmake/packages/CMSISSource-5.8.0/CMSISSourceConfigVersion.cmake b/infra/cmake/packages/CMSISSource-5.8.0/CMSISSourceConfigVersion.cmake
new file mode 100644
index 000000000..ca6f7826d
--- /dev/null
+++ b/infra/cmake/packages/CMSISSource-5.8.0/CMSISSourceConfigVersion.cmake
@@ -0,0 +1,10 @@
+set(PACKAGE_VERSION "5.8.0")
+set(PACKAGE_VERSION_EXACT FALSE)
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_UNSUITABLE TRUE)
+
+if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
+  set(PACKAGE_VERSION_EXACT TRUE)
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  set(PACKAGE_VERSION_UNSUITABLE FALSE)
+endif(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
diff --git a/infra/cmake/packages/FlatBuffers-1.10/FlatBuffersConfig.cmake b/infra/cmake/packages/FlatBuffers-1.10/FlatBuffersConfig.cmake
new file mode 100644
index 000000000..0eb8eb91c
--- /dev/null
+++ b/infra/cmake/packages/FlatBuffers-1.10/FlatBuffersConfig.cmake
@@ -0,0 +1,118 @@
+function(_FlatBuffers_import)
+  find_package(Flatbuffers QUIET)
+  set(FlatBuffers_FOUND ${Flatbuffers_FOUND} PARENT_SCOPE)
+endfunction(_FlatBuffers_import)
+
+function(_FlatBuffers_build)
+  if(NOT BUILD_FLATBUFFERS)
+    message(STATUS "FlatBuffersConfig skip: BUILD_FLATBUFFERS OFF")
+    return()
+  endif(NOT BUILD_FLATBUFFERS)
+
+  nnas_find_package(FlatBuffersSource EXACT 1.10 QUIET)
+
+  if(NOT FlatBuffersSource_FOUND)
+    # Source is not available
+    message(STATUS "FlatBuffersConfig skip: FlatBuffersSource not found")
+    return()
+  endif(NOT FlatBuffersSource_FOUND)
+
+  set(ADDITIONAL_CXX_FLAGS "")
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0)
+    set(ADDITIONAL_CXX_FLAGS "-Wno-error=class-memaccess")
+  endif()
+
+  nnas_include(ExternalBuildTools)
+  ExternalBuild_CMake(CMAKE_DIR   ${FlatBuffersSource_DIR}
+                      BUILD_DIR   ${CMAKE_BINARY_DIR}/externals/FLATBUFFERS-1.10/build
+                      INSTALL_DIR ${EXT_OVERLAY_DIR}/FLATBUFFERS-1.10
+                      BUILD_FLAGS ${ADDITIONAL_CXX_FLAGS}
+                      IDENTIFIER  "1.10-fix4"
+                      EXTRA_OPTS  "-DFLATBUFFERS_BUILD_TESTS:BOOL=OFF -DPOSITION_INDEPENDENT_CODE:BOOL=ON"
+                      PKG_NAME    "FLATBUFFERS-1.10")
+
+endfunction(_FlatBuffers_build)
+
+_FlatBuffers_build()
+_FlatBuffers_import()
+
+if(FlatBuffers_FOUND)
+  if(NOT TARGET flatbuffers-1.10)
+    add_library(flatbuffers-1.10 INTERFACE)
+    target_link_libraries(flatbuffers-1.10 INTERFACE flatbuffers::flatbuffers)
+    message(STATUS "Found FlatBuffers-1.10: TRUE")
+  endif(NOT TARGET flatbuffers-1.10)
+
+  function(FlatBuffers_Generate PREFIX OUTPUT_DIR SCHEMA_DIR)
+    get_filename_component(abs_output_dir ${OUTPUT_DIR} ABSOLUTE)
+    get_filename_component(abs_schema_dir ${SCHEMA_DIR} ABSOLUTE)
+
+    foreach(schema ${ARGN})
+      get_filename_component(schema_fn "${schema}" NAME)
+      get_filename_component(dir "${schema}" DIRECTORY)
+
+      get_filename_component(schema_fn_we "${schema_fn}" NAME_WE)
+
+      list(APPEND SCHEMA_FILES "${abs_schema_dir}/${schema}")
+      list(APPEND OUTPUT_FILES "${abs_output_dir}/${schema_fn_we}_generated.h")
+    endforeach()
+
+    add_custom_command(OUTPUT ${OUTPUT_FILES}
+                       COMMAND ${CMAKE_COMMAND} -E make_directory "${abs_output_dir}"
+                       COMMAND "$<TARGET_FILE:flatbuffers::flatc>" -c --no-includes
+                       --no-union-value-namespacing
+                       --gen-object-api -o "${abs_output_dir}"
+                       ${SCHEMA_FILES}
+                       DEPENDS flatbuffers::flatc)
+
+    set(${PREFIX}_SOURCES ${OUTPUT_FILES} PARENT_SCOPE)
+    set(${PREFIX}_INCLUDE_DIRS ${abs_output_dir} PARENT_SCOPE)
+  endfunction(FlatBuffers_Generate)
+
+  function(FlatBuffers_Target TGT)
+    set(oneValueArgs OUTPUT_DIR SCHEMA_DIR INCLUDE_DIR)
+    set(multiValueArgs SCHEMA_FILES)
+    cmake_parse_arguments(ARG "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    # Use OUTPUT_DIR as INCLUDE_DIR if INCLUDE_DIR is not specified
+    if(NOT ARG_INCLUDE_DIR)
+      set(ARG_INCLUDE_DIR ${ARG_OUTPUT_DIR})
+    endif(NOT ARG_INCLUDE_DIR)
+
+    get_filename_component(abs_output_dir ${ARG_OUTPUT_DIR} ABSOLUTE)
+    get_filename_component(abs_include_dir ${ARG_INCLUDE_DIR} ABSOLUTE)
+    get_filename_component(abs_schema_dir ${ARG_SCHEMA_DIR} ABSOLUTE)
+
+    # Let's reset list variables before using them
+    # NOTE THIS DOES NOT AFFECT parent scope
+    unset(SCHEMA_FILES)
+    unset(OUTPUT_FILES)
+
+    foreach(schema ${ARG_SCHEMA_FILES})
+      get_filename_component(schema_fn "${schema}" NAME)
+      get_filename_component(dir "${schema}" DIRECTORY)
+
+      get_filename_component(schema_fn_we "${schema_fn}" NAME_WE)
+
+      list(APPEND SCHEMA_FILES "${abs_schema_dir}/${schema}")
+      list(APPEND OUTPUT_FILES "${abs_output_dir}/${schema_fn_we}_generated.h")
+    endforeach()
+
+    # Generate headers
+    add_custom_command(OUTPUT ${OUTPUT_FILES}
+                       COMMAND ${CMAKE_COMMAND} -E make_directory "${abs_output_dir}"
+                       COMMAND "$<TARGET_FILE:flatbuffers::flatc>" -c --no-includes
+                               --no-union-value-namespacing
+                               --gen-object-api -o "${abs_output_dir}"
+                               ${SCHEMA_FILES}
+                       DEPENDS ${SCHEMA_FILES}
+                       COMMENT "Generate '${TGT}' headers")
+
+    # NOTE This header-only library is deliberately declared as STATIC library
+    #      to avoid possible scope issues related with generated files
+    add_library(${TGT} STATIC ${OUTPUT_FILES})
+    set_target_properties(${TGT} PROPERTIES LINKER_LANGUAGE CXX)
+    target_include_directories(${TGT} PUBLIC "${ARG_INCLUDE_DIR}")
+    target_link_libraries(${TGT} PUBLIC flatbuffers-1.10)
+  endfunction(FlatBuffers_Target)
+endif(FlatBuffers_FOUND)
diff --git a/infra/cmake/packages/FlatBuffers-1.10/FlatBuffersConfigVersion.cmake b/infra/cmake/packages/FlatBuffers-1.10/FlatBuffersConfigVersion.cmake
new file mode 100644
index 000000000..6585f21d5
--- /dev/null
+++ b/infra/cmake/packages/FlatBuffers-1.10/FlatBuffersConfigVersion.cmake
@@ -0,0 +1,10 @@
+set(PACKAGE_VERSION "1.10")
+set(PACKAGE_VERSION_EXACT FALSE)
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_UNSUITABLE TRUE)
+
+if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
+  set(PACKAGE_VERSION_EXACT TRUE)
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  set(PACKAGE_VERSION_UNSUITABLE FALSE)
+endif(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
diff --git a/infra/cmake/packages/FlatBuffers-1.12/FlatBuffersConfig.cmake b/infra/cmake/packages/FlatBuffers-1.12/FlatBuffersConfig.cmake
new file mode 100644
index 000000000..daa749c58
--- /dev/null
+++ b/infra/cmake/packages/FlatBuffers-1.12/FlatBuffersConfig.cmake
@@ -0,0 +1,118 @@
+function(_FlatBuffers_import)
+  find_package(Flatbuffers QUIET)
+  set(FlatBuffers_FOUND ${Flatbuffers_FOUND} PARENT_SCOPE)
+endfunction(_FlatBuffers_import)
+
+function(_FlatBuffers_build)
+  if(NOT BUILD_FLATBUFFERS)
+    message(STATUS "FlatBuffersConfig !BUILD_FLATBUFFERS")
+    return()
+  endif(NOT BUILD_FLATBUFFERS)
+
+  nnas_find_package(FlatBuffersSource EXACT 1.12 QUIET)
+
+  if(NOT FlatBuffersSource_FOUND)
+    # Source is not available
+    message(STATUS "FlatBuffersConfig !FlatBuffersSource_FOUND")
+    return()
+  endif(NOT FlatBuffersSource_FOUND)
+
+  set(ADDITIONAL_CXX_FLAGS "")
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0)
+    set(ADDITIONAL_CXX_FLAGS "-Wno-error=class-memaccess")
+  endif()
+
+  nnas_include(ExternalBuildTools)
+  ExternalBuild_CMake(CMAKE_DIR   ${FlatBuffersSource_DIR}
+                      BUILD_DIR   ${CMAKE_BINARY_DIR}/externals/FLATBUFFERS-1.12/build
+                      INSTALL_DIR ${EXT_OVERLAY_DIR}/FLATBUFFERS-1.12
+                      BUILD_FLAGS ${ADDITIONAL_CXX_FLAGS}
+                      IDENTIFIER  "1.12-fix1"
+                      EXTRA_OPTS  "-DFLATBUFFERS_BUILD_TESTS:BOOL=OFF -DPOSITION_INDEPENDENT_CODE:BOOL=ON"
+                      PKG_NAME    "FLATBUFFERS-1.12")
+
+endfunction(_FlatBuffers_build)
+
+_FlatBuffers_build()
+_FlatBuffers_import()
+
+if(FlatBuffers_FOUND)
+  if(NOT TARGET flatbuffers-1.12)
+    add_library(flatbuffers-1.12 INTERFACE)
+    target_link_libraries(flatbuffers-1.12 INTERFACE flatbuffers::flatbuffers)
+    message(STATUS "Found FlatBuffers-1.12: TRUE")
+  endif(NOT TARGET flatbuffers-1.12)
+
+  function(FlatBuffers_Generate PREFIX OUTPUT_DIR SCHEMA_DIR)
+    get_filename_component(abs_output_dir ${OUTPUT_DIR} ABSOLUTE)
+    get_filename_component(abs_schema_dir ${SCHEMA_DIR} ABSOLUTE)
+
+    foreach(schema ${ARGN})
+      get_filename_component(schema_fn "${schema}" NAME)
+      get_filename_component(dir "${schema}" DIRECTORY)
+
+      get_filename_component(schema_fn_we "${schema_fn}" NAME_WE)
+
+      list(APPEND SCHEMA_FILES "${abs_schema_dir}/${schema}")
+      list(APPEND OUTPUT_FILES "${abs_output_dir}/${schema_fn_we}_generated.h")
+    endforeach()
+
+    add_custom_command(OUTPUT ${OUTPUT_FILES}
+                       COMMAND ${CMAKE_COMMAND} -E make_directory "${abs_output_dir}"
+                       COMMAND "$<TARGET_FILE:flatbuffers::flatc>" -c --no-includes
+                       --no-union-value-namespacing
+                       --gen-object-api -o "${abs_output_dir}"
+                       ${SCHEMA_FILES}
+                       DEPENDS flatbuffers::flatc)
+
+    set(${PREFIX}_SOURCES ${OUTPUT_FILES} PARENT_SCOPE)
+    set(${PREFIX}_INCLUDE_DIRS ${abs_output_dir} PARENT_SCOPE)
+  endfunction(FlatBuffers_Generate)
+
+  function(FlatBuffers_Target TGT)
+    set(oneValueArgs OUTPUT_DIR SCHEMA_DIR INCLUDE_DIR)
+    set(multiValueArgs SCHEMA_FILES)
+    cmake_parse_arguments(ARG "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    # Use OUTPUT_DIR as INCLUDE_DIR if INCLUDE_DIR is not specified
+    if(NOT ARG_INCLUDE_DIR)
+      set(ARG_INCLUDE_DIR ${ARG_OUTPUT_DIR})
+    endif(NOT ARG_INCLUDE_DIR)
+
+    get_filename_component(abs_output_dir ${ARG_OUTPUT_DIR} ABSOLUTE)
+    get_filename_component(abs_include_dir ${ARG_INCLUDE_DIR} ABSOLUTE)
+    get_filename_component(abs_schema_dir ${ARG_SCHEMA_DIR} ABSOLUTE)
+
+    # Let's reset list variables before using them
+    # NOTE THIS DOES NOT AFFECT parent scope
+    unset(SCHEMA_FILES)
+    unset(OUTPUT_FILES)
+
+    foreach(schema ${ARG_SCHEMA_FILES})
+      get_filename_component(schema_fn "${schema}" NAME)
+      get_filename_component(dir "${schema}" DIRECTORY)
+
+      get_filename_component(schema_fn_we "${schema_fn}" NAME_WE)
+
+      list(APPEND SCHEMA_FILES "${abs_schema_dir}/${schema}")
+      list(APPEND OUTPUT_FILES "${abs_output_dir}/${schema_fn_we}_generated.h")
+    endforeach()
+
+    # Generate headers
+    add_custom_command(OUTPUT ${OUTPUT_FILES}
+                       COMMAND ${CMAKE_COMMAND} -E make_directory "${abs_output_dir}"
+                       COMMAND "$<TARGET_FILE:flatbuffers::flatc>" -c --no-includes
+                               --no-union-value-namespacing
+                               --gen-object-api -o "${abs_output_dir}"
+                               ${SCHEMA_FILES}
+                       DEPENDS ${SCHEMA_FILES}
+                       COMMENT "Generate '${TGT}' headers")
+
+    # NOTE This header-only library is deliberately declared as STATIC library
+    #      to avoid possible scope issues related with generated files
+    add_library(${TGT} STATIC ${OUTPUT_FILES})
+    set_target_properties(${TGT} PROPERTIES LINKER_LANGUAGE CXX)
+    target_include_directories(${TGT} PUBLIC "${ARG_INCLUDE_DIR}")
+    target_link_libraries(${TGT} PUBLIC flatbuffers-1.12)
+  endfunction(FlatBuffers_Target)
+endif(FlatBuffers_FOUND)
diff --git a/infra/cmake/packages/FlatBuffers-1.12/FlatBuffersConfigVersion.cmake b/infra/cmake/packages/FlatBuffers-1.12/FlatBuffersConfigVersion.cmake
new file mode 100644
index 000000000..8cfdbf8e5
--- /dev/null
+++ b/infra/cmake/packages/FlatBuffers-1.12/FlatBuffersConfigVersion.cmake
@@ -0,0 +1,10 @@
+set(PACKAGE_VERSION "1.12")
+set(PACKAGE_VERSION_EXACT FALSE)
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_UNSUITABLE TRUE)
+
+if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
+  set(PACKAGE_VERSION_EXACT TRUE)
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  set(PACKAGE_VERSION_UNSUITABLE FALSE)
+endif(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
diff --git a/infra/cmake/packages/FlatBuffersConfig.cmake b/infra/cmake/packages/FlatBuffersConfig.cmake
index da084e7d3..e551e29c8 100644
--- a/infra/cmake/packages/FlatBuffersConfig.cmake
+++ b/infra/cmake/packages/FlatBuffersConfig.cmake
@@ -5,6 +5,7 @@ endfunction(_FlatBuffers_import)
 
 function(_FlatBuffers_build)
   if(NOT BUILD_FLATBUFFERS)
+    message(STATUS "FlatBuffersConfig skip: BUILD_FLATBUFFERS OFF")
     return()
   endif(NOT BUILD_FLATBUFFERS)
 
@@ -12,6 +13,7 @@ function(_FlatBuffers_build)
 
   if(NOT FlatBuffersSource_FOUND)
     # Source is not available
+    message(STATUS "FlatBuffersConfig skip: FlatBuffersSource not found")
     return()
   endif(NOT FlatBuffersSource_FOUND)
 
@@ -22,12 +24,12 @@ function(_FlatBuffers_build)
 
   nnas_include(ExternalBuildTools)
   ExternalBuild_CMake(CMAKE_DIR   ${FlatBuffersSource_DIR}
-                      BUILD_DIR   ${CMAKE_BINARY_DIR}/externals/FLATBUFFERS/build
-                      INSTALL_DIR ${EXT_OVERLAY_DIR}
+                      BUILD_DIR   ${CMAKE_BINARY_DIR}/externals/FLATBUFFERS-1.10/build
+                      INSTALL_DIR ${EXT_OVERLAY_DIR}/FLATBUFFERS-1.10
                       BUILD_FLAGS ${ADDITIONAL_CXX_FLAGS}
-                      IDENTIFIER  "1.10-fix2"
-                      EXTRA_OPTS "-DFLATBUFFERS_BUILD_TESTS:BOOL=OFF"
-                      PKG_NAME    "FLATBUFFERS")
+                      IDENTIFIER  "1.10-fix4"
+                      EXTRA_OPTS "-DFLATBUFFERS_BUILD_TESTS:BOOL=OFF -DPOSITION_INDEPENDENT_CODE:BOOL=ON"
+                      PKG_NAME    "FLATBUFFERS-1.10")
 
 endfunction(_FlatBuffers_build)
 
@@ -35,11 +37,11 @@ _FlatBuffers_build()
 _FlatBuffers_import()
 
 if(FlatBuffers_FOUND)
-  if(NOT TARGET flatbuffers)
-    add_library(flatbuffers INTERFACE)
-    target_link_libraries(flatbuffers INTERFACE flatbuffers::flatbuffers)
-    message(STATUS "Found FlatBuffers: TRUE")
-  endif(NOT TARGET flatbuffers)
+  if(NOT TARGET flatbuffers-1.10)
+    add_library(flatbuffers-1.10 INTERFACE)
+    target_link_libraries(flatbuffers-1.10 INTERFACE flatbuffers::flatbuffers)
+    message(STATUS "Found FlatBuffers-1.10: TRUE")
+  endif(NOT TARGET flatbuffers-1.10)
 
   function(FlatBuffers_Generate PREFIX OUTPUT_DIR SCHEMA_DIR)
     get_filename_component(abs_output_dir ${OUTPUT_DIR} ABSOLUTE)
@@ -111,6 +113,6 @@ if(FlatBuffers_FOUND)
     add_library(${TGT} STATIC ${OUTPUT_FILES})
     set_target_properties(${TGT} PROPERTIES LINKER_LANGUAGE CXX)
     target_include_directories(${TGT} PUBLIC "${ARG_INCLUDE_DIR}")
-    target_link_libraries(${TGT} PUBLIC flatbuffers)
+    target_link_libraries(${TGT} PUBLIC flatbuffers-1.10)
   endfunction(FlatBuffers_Target)
 endif(FlatBuffers_FOUND)
diff --git a/infra/cmake/packages/MbedOSSource-6.15/MbedOSSourceConfig.cmake b/infra/cmake/packages/MbedOSSource-6.15/MbedOSSourceConfig.cmake
new file mode 100644
index 000000000..8b1743066
--- /dev/null
+++ b/infra/cmake/packages/MbedOSSource-6.15/MbedOSSourceConfig.cmake
@@ -0,0 +1,13 @@
+function(_MbedOSSource_import)
+  nnas_include(ExternalSourceTools)
+  nnas_include(OptionTools)
+
+  envoption(MBEDOS_6_15_URL https://github.com/ARMmbed/mbed-os/archive/refs/tags/mbed-os-6.15.0.tar.gz)
+
+  ExternalSource_Download(MBEDOS DIRNAME MBEDOS-6.15 ${MBEDOS_6_15_URL})
+
+  set(MbedOSSource_DIR ${MBEDOS_SOURCE_DIR} PARENT_SCOPE)
+  set(MbedOSSource_FOUND TRUE PARENT_SCOPE)
+endfunction(_MbedOSSource_import)
+
+_MbedOSSource_import()
diff --git a/infra/cmake/packages/MbedOSSource-6.15/MbedOSSourceConfigVersion.cmake b/infra/cmake/packages/MbedOSSource-6.15/MbedOSSourceConfigVersion.cmake
new file mode 100644
index 000000000..acdd54ad6
--- /dev/null
+++ b/infra/cmake/packages/MbedOSSource-6.15/MbedOSSourceConfigVersion.cmake
@@ -0,0 +1,10 @@
+set(PACKAGE_VERSION "6.15")
+set(PACKAGE_VERSION_EXACT FALSE)
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_UNSUITABLE TRUE)
+
+if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
+  set(PACKAGE_VERSION_EXACT TRUE)
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  set(PACKAGE_VERSION_UNSUITABLE FALSE)
+endif(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
diff --git a/infra/cmake/packages/TensorFlowEigenSource-2.6.0/TensorFlowEigenSourceConfig.cmake b/infra/cmake/packages/TensorFlowEigenSource-2.6.0/TensorFlowEigenSourceConfig.cmake
new file mode 100644
index 000000000..a9ec75d34
--- /dev/null
+++ b/infra/cmake/packages/TensorFlowEigenSource-2.6.0/TensorFlowEigenSourceConfig.cmake
@@ -0,0 +1,21 @@
+function(_TensorFlowEigenSource_import)
+  if(NOT DOWNLOAD_EIGEN)
+    set(TensorFlowEigenSource_FOUND FALSE PARENT_SCOPE)
+    return()
+  endif(NOT DOWNLOAD_EIGEN)
+
+  nnas_include(ExternalSourceTools)
+  nnas_include(OptionTools)
+
+  # Exact version used by TensorFlow v2.6.0.
+  # See tensorflow/third_party/eigen3/workspace.bzl.
+  envoption(EXTERNAL_DOWNLOAD_SERVER "https://gitlab.com")
+  envoption(TENSORFLOW_2_6_0_EIGEN_URL ${EXTERNAL_DOWNLOAD_SERVER}/libeigen/eigen/-/archive/12e8d57108c50d8a63605c6eb0144c838c128337/eigen-12e8d57108c50d8a63605c6eb0144c838c128337.tar.gz)
+
+  ExternalSource_Download(EIGEN DIRNAME TENSORFLOW-2.6.0-EIGEN ${TENSORFLOW_2_6_0_EIGEN_URL})
+
+  set(TensorFlowEigenSource_DIR ${EIGEN_SOURCE_DIR} PARENT_SCOPE)
+  set(TensorFlowEigenSource_FOUND TRUE PARENT_SCOPE)
+endfunction(_TensorFlowEigenSource_import)
+
+_TensorFlowEigenSource_import()
diff --git a/infra/cmake/packages/TensorFlowEigenSource-2.6.0/TensorFlowEigenSourceConfigVersion.cmake b/infra/cmake/packages/TensorFlowEigenSource-2.6.0/TensorFlowEigenSourceConfigVersion.cmake
new file mode 100644
index 000000000..38ad0aa31
--- /dev/null
+++ b/infra/cmake/packages/TensorFlowEigenSource-2.6.0/TensorFlowEigenSourceConfigVersion.cmake
@@ -0,0 +1,10 @@
+set(PACKAGE_VERSION "2.6.0")
+set(PACKAGE_VERSION_EXACT FALSE)
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_UNSUITABLE TRUE)
+
+if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
+  set(PACKAGE_VERSION_EXACT TRUE)
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  set(PACKAGE_VERSION_UNSUITABLE FALSE)
+endif(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
diff --git a/infra/cmake/packages/TensorFlowGEMMLowpSource-2.6.0/TensorFlowGEMMLowpSourceConfig.cmake b/infra/cmake/packages/TensorFlowGEMMLowpSource-2.6.0/TensorFlowGEMMLowpSourceConfig.cmake
new file mode 100644
index 000000000..b7f3148e8
--- /dev/null
+++ b/infra/cmake/packages/TensorFlowGEMMLowpSource-2.6.0/TensorFlowGEMMLowpSourceConfig.cmake
@@ -0,0 +1,20 @@
+function(_TensorFlowGEMMLowpSource_import)
+  if(NOT DOWNLOAD_GEMMLOWP)
+    set(TensorFlowGEMMLowpSource_FOUND FALSE PARENT_SCOPE)
+    return()
+  endif(NOT DOWNLOAD_GEMMLOWP)
+
+  nnas_include(ExternalSourceTools)
+  nnas_include(OptionTools)
+
+  # Exact version used by TensorFlow v2.6.0.
+  # See tensorflow/third_party/gemmlowp/workspace.bzl.
+  envoption(TENSORFLOW_2_6_0_GEMMLOWP_URL https://github.com/google/gemmlowp/archive/fda83bdc38b118cc6b56753bd540caa49e570745.zip)
+
+  ExternalSource_Download(GEMMLOWP DIRNAME TENSORFLOW-2.6.0-GEMMLOWP ${TENSORFLOW_2_6_0_GEMMLOWP_URL})
+
+  set(TensorFlowGEMMLowpSource_DIR ${GEMMLOWP_SOURCE_DIR} PARENT_SCOPE)
+  set(TensorFlowGEMMLowpSource_FOUND TRUE PARENT_SCOPE)
+endfunction(_TensorFlowGEMMLowpSource_import)
+
+_TensorFlowGEMMLowpSource_import()
diff --git a/infra/cmake/packages/TensorFlowGEMMLowpSource-2.6.0/TensorFlowGEMMLowpSourceConfigVersion.cmake b/infra/cmake/packages/TensorFlowGEMMLowpSource-2.6.0/TensorFlowGEMMLowpSourceConfigVersion.cmake
new file mode 100644
index 000000000..38ad0aa31
--- /dev/null
+++ b/infra/cmake/packages/TensorFlowGEMMLowpSource-2.6.0/TensorFlowGEMMLowpSourceConfigVersion.cmake
@@ -0,0 +1,10 @@
+set(PACKAGE_VERSION "2.6.0")
+set(PACKAGE_VERSION_EXACT FALSE)
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_UNSUITABLE TRUE)
+
+if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
+  set(PACKAGE_VERSION_EXACT TRUE)
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  set(PACKAGE_VERSION_UNSUITABLE FALSE)
+endif(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
diff --git a/infra/cmake/packages/TensorFlowRuySource-2.6.0/TensorFlowRuySourceConfig.cmake b/infra/cmake/packages/TensorFlowRuySource-2.6.0/TensorFlowRuySourceConfig.cmake
new file mode 100644
index 000000000..b4dee914f
--- /dev/null
+++ b/infra/cmake/packages/TensorFlowRuySource-2.6.0/TensorFlowRuySourceConfig.cmake
@@ -0,0 +1,20 @@
+function(_TensorFlowRuySource_import)
+  if(NOT DOWNLOAD_RUY)
+    set(TensorFlowRuySource_FOUND FALSE PARENT_SCOPE)
+    return()
+  endif(NOT DOWNLOAD_RUY)
+
+  nnas_include(ExternalSourceTools)
+  nnas_include(OptionTools)
+
+  # Exact version used by TensorFlow v2.6.0.
+  # See tensorflow/third_party/ruy/workspace.bzl
+  envoption(TENSORFLOW_2_6_0_RUY_URL https://github.com/google/ruy/archive/e6c1b8dc8a8b00ee74e7268aac8b18d7260ab1ce.zip)
+
+  ExternalSource_Download(RUY DIRNAME TENSORFLOW-2.6.0-RUY ${TENSORFLOW_2_6_0_RUY_URL})
+
+  set(TensorFlowRuySource_DIR ${RUY_SOURCE_DIR} PARENT_SCOPE)
+  set(TensorFlowRuySource_FOUND TRUE PARENT_SCOPE)
+endfunction(_TensorFlowRuySource_import)
+
+_TensorFlowRuySource_import()
diff --git a/infra/cmake/packages/TensorFlowRuySource-2.6.0/TensorFlowRuySourceConfigVersion.cmake b/infra/cmake/packages/TensorFlowRuySource-2.6.0/TensorFlowRuySourceConfigVersion.cmake
new file mode 100644
index 000000000..38ad0aa31
--- /dev/null
+++ b/infra/cmake/packages/TensorFlowRuySource-2.6.0/TensorFlowRuySourceConfigVersion.cmake
@@ -0,0 +1,10 @@
+set(PACKAGE_VERSION "2.6.0")
+set(PACKAGE_VERSION_EXACT FALSE)
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_UNSUITABLE TRUE)
+
+if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
+  set(PACKAGE_VERSION_EXACT TRUE)
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  set(PACKAGE_VERSION_UNSUITABLE FALSE)
+endif(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
diff --git a/infra/cmake/packages/TensorFlowSource-2.6.0/TensorFlowSourceConfig.cmake b/infra/cmake/packages/TensorFlowSource-2.6.0/TensorFlowSourceConfig.cmake
new file mode 100644
index 000000000..611c7c805
--- /dev/null
+++ b/infra/cmake/packages/TensorFlowSource-2.6.0/TensorFlowSourceConfig.cmake
@@ -0,0 +1,18 @@
+function(_TensorFlowSource_import)
+  if(NOT DOWNLOAD_TENSORFLOW)
+    set(TensorFlowSource_FOUND FALSE PARENT_SCOPE)
+    return()
+  endif(NOT DOWNLOAD_TENSORFLOW)
+
+  nnas_include(ExternalSourceTools)
+  nnas_include(OptionTools)
+
+  envoption(TENSORFLOW_2_6_0_URL https://github.com/tensorflow/tensorflow/archive/v2.6.0.tar.gz)
+
+  ExternalSource_Download(TENSORFLOW DIRNAME TENSORFLOW-2.6.0 ${TENSORFLOW_2_6_0_URL})
+
+  set(TensorFlowSource_DIR ${TENSORFLOW_SOURCE_DIR} PARENT_SCOPE)
+  set(TensorFlowSource_FOUND TRUE PARENT_SCOPE)
+endfunction(_TensorFlowSource_import)
+
+_TensorFlowSource_import()
diff --git a/infra/cmake/packages/TensorFlowSource-2.6.0/TensorFlowSourceConfigVersion.cmake b/infra/cmake/packages/TensorFlowSource-2.6.0/TensorFlowSourceConfigVersion.cmake
new file mode 100644
index 000000000..38ad0aa31
--- /dev/null
+++ b/infra/cmake/packages/TensorFlowSource-2.6.0/TensorFlowSourceConfigVersion.cmake
@@ -0,0 +1,10 @@
+set(PACKAGE_VERSION "2.6.0")
+set(PACKAGE_VERSION_EXACT FALSE)
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_UNSUITABLE TRUE)
+
+if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
+  set(PACKAGE_VERSION_EXACT TRUE)
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  set(PACKAGE_VERSION_UNSUITABLE FALSE)
+endif(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
diff --git a/infra/debian/compiler/changelog b/infra/debian/compiler/changelog
index 6859255ff..12af5f928 100644
--- a/infra/debian/compiler/changelog
+++ b/infra/debian/compiler/changelog
@@ -1,3 +1,9 @@
+one (1.18.0) bionic; urgency=medium
+
+  * More optimization pass
+
+ -- seongwoo <mhs4670go@naver.com>  Fri, 15 Oct 2021 15:23:20 +0900
+
 one (1.17.0) bionic; urgency=medium
 
   * More optimization pass
diff --git a/infra/debian/compiler/one-compiler.install b/infra/debian/compiler/one-compiler.install
index ba628545b..cbca47802 100644
--- a/infra/debian/compiler/one-compiler.install
+++ b/infra/debian/compiler/one-compiler.install
@@ -3,7 +3,6 @@
 usr/bin/circle2circle usr/share/one/bin/
 usr/bin/circle_partitioner usr/share/one/bin/
 usr/bin/circle-quantizer usr/share/one/bin/
-usr/bin/conv_mixin_1.8.0.patch usr/share/one/bin/
 usr/bin/generate_bcq_metadata.py usr/share/one/bin/
 usr/bin/generate_bcq_output_arrays.py usr/share/one/bin/
 usr/bin/model2nnpkg.sh usr/share/one/bin/
diff --git a/infra/debian/compiler/one-compiler.links b/infra/debian/compiler/one-compiler.links
index 8b6e542c1..9e464352a 100644
--- a/infra/debian/compiler/one-compiler.links
+++ b/infra/debian/compiler/one-compiler.links
@@ -13,4 +13,5 @@ usr/share/one/lib/libluci_log.so usr/lib/libluci_log.so
 usr/share/one/lib/libluci_partition.so usr/lib/libluci_partition.so
 usr/share/one/lib/libluci_pass.so usr/lib/libluci_pass.so
 usr/share/one/lib/libluci_profile.so usr/lib/libluci_profile.so
+usr/share/one/lib/libluci_plan.so usr/lib/libluci_plan.so
 usr/share/one/lib/libluci_service.so usr/lib/libluci_service.so
diff --git a/infra/debian/compiler/rules b/infra/debian/compiler/rules
index 21b956b2f..e42faae09 100755
--- a/infra/debian/compiler/rules
+++ b/infra/debian/compiler/rules
@@ -1,7 +1,7 @@
 #!/usr/bin/make -f
 export DH_VERBOSE = 1
 export NNAS_BUILD_PREFIX = build
-export PRESET = 20210706
+export PRESET = 20210910
 export _DESTDIR = debian/tmp/usr
 
 %:
diff --git a/infra/debian/runtime/changelog b/infra/debian/runtime/changelog
index 4a41d959c..ee0d3e6ee 100644
--- a/infra/debian/runtime/changelog
+++ b/infra/debian/runtime/changelog
@@ -1,3 +1,9 @@
+one (1.18.0) bionic; urgency=low
+
+  * Synch up version with ONE Compiler
+
+ --  Chunseok Lee <chunseok.lee@samsung.com>  Fri, 15 Oct 2021 15:23:00 +0900
+
 one (1.17.0) bionic; urgency=low
 
   * New gpu_gl backend supports the following operations : Add, Convolution, Depthwise Convolution, Pooling, Reshape, Relu, Softmax
diff --git a/infra/nncc/CMakeLists.txt b/infra/nncc/CMakeLists.txt
index eb279902e..bde684938 100644
--- a/infra/nncc/CMakeLists.txt
+++ b/infra/nncc/CMakeLists.txt
@@ -130,6 +130,11 @@ option(ENABLE_STRICT_BUILD "Treat warning as error" OFF)
 # Check our ProtobufConfig.cmake for its usage.
 option(USE_PROTOBUF_LEGACY_IMPORT "Use legacy MODULE mode import rather than CONFIG mode" OFF)
 
+# This option might be turned ON for MCU builds of luci related components.
+# It specify which library type to use for build:
+# if set ON - luci libraries are static, otherwise - shared.
+option(STATIC_LUCI "Build luci as a static libraries" OFF)
+
 ###
 ### Target
 ###
diff --git a/infra/nncc/command/utcount b/infra/nncc/command/utcount
index 64aaace9b..65aea8bae 100644
--- a/infra/nncc/command/utcount
+++ b/infra/nncc/command/utcount
@@ -14,7 +14,7 @@ oops pepper-assert \
 hermes hermes-std \
 loco locop locomotiv logo-core logo \
 foder souschef arser vconone crew \
-safemain mio-circle mio-tflite \
+safemain mio-circle mio-tflite mio-tflite260 \
 tflite2circle \
 luci \
 luci-interpreter \
diff --git a/infra/packaging/build b/infra/packaging/build
index 8d3230010..53d63713b 100644
--- a/infra/packaging/build
+++ b/infra/packaging/build
@@ -8,7 +8,7 @@ if [[ -z "${NNAS_PROJECT_PATH}" ]]; then
 fi
 
 # The default preset
-PRESET="20210706"
+PRESET="20210910"
 
 EXTRA_OPTIONS=()
 while [ "$#" -ne 0 ]; do
diff --git a/infra/packaging/preset/20210910 b/infra/packaging/preset/20210910
new file mode 100644
index 000000000..d00b1ccad
--- /dev/null
+++ b/infra/packaging/preset/20210910
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# NOTE purpose of this file is static analysis only
+#      new official preset will be added when new programs are ready
+
+PRESET="20210910"
+
+function preset_configure()
+{
+  REQUIRED_UNITS=()
+  # Common Libraries
+  REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp")
+  REQUIRED_UNITS+=("oops" "pepper-assert" "pepper-csv2vec" "foder" "crew")
+  REQUIRED_UNITS+=("souschef")
+  REQUIRED_UNITS+=("safemain")
+  REQUIRED_UNITS+=("arser")
+  REQUIRED_UNITS+=("vconone")
+  # Hermes Logging Framework
+  REQUIRED_UNITS+=("hermes" "hermes-std")
+  # loco IR and related utilities
+  REQUIRED_UNITS+=("loco" "locop" "locomotiv" "logo-core" "logo")
+  # Flatbuffer I/O
+  REQUIRED_UNITS+=("mio-tflite" "mio-tflite260" "mio-circle")
+  # Circle compiler library (.circle -> .circle)
+  REQUIRED_UNITS+=("luci")
+  # Tools
+  REQUIRED_UNITS+=("tflite2circle" "circle2circle" "tflchef" "circlechef")
+  REQUIRED_UNITS+=("circle-tensordump" "circledump")
+  REQUIRED_UNITS+=("tf2tfliteV2" "luci-interpreter" "circle-verify")
+  REQUIRED_UNITS+=("luci-eval-driver")
+  REQUIRED_UNITS+=("record-minmax" "circle-quantizer" "rawdata2hdf5")
+  REQUIRED_UNITS+=("circle-partitioner")
+  REQUIRED_UNITS+=("one-cmds")
+  REQUIRED_UNITS+=("bcq-tools")
+
+  NPROC=${NPROC:-$(cat /proc/cpuinfo | grep -c processor)}
+
+  # TODO Use "nncc configure" and "nncc build"
+  cmake \
+    -DCMAKE_INSTALL_PREFIX="${NNCC_INSTALL_PREFIX}" \
+    -DCMAKE_BUILD_TYPE=release \
+    -DBUILD_WHITELIST=$(join_by ";" "${REQUIRED_UNITS[@]}") \
+    -DEXTERNALS_BUILD_THREADS=$((NPROC/2)) \
+    ${EXTRA_OPTIONS[@]} \
+    "${NNAS_PROJECT_PATH}/infra/nncc"
+}
+
+function preset_install()
+{
+  install -t "${NNPKG_INSTALL_PREFIX}/bin" -D \
+    "${NNAS_PROJECT_PATH}/tools/nnpackage_tool/model2nnpkg/model2nnpkg.sh"
+
+  # Install tf2nnpkg
+  install -T -m 755 -D "${SCRIPT_PATH}/res/tf2nnpkg.${PRESET}" "${NNAS_INSTALL_PREFIX}/bin/tf2nnpkg"
+}
diff --git a/infra/packaging/preset/20210910_windows b/infra/packaging/preset/20210910_windows
new file mode 100644
index 000000000..642bdbd76
--- /dev/null
+++ b/infra/packaging/preset/20210910_windows
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+function preset_configure()
+{
+  REQUIRED_UNITS=()
+  # Common Libraries
+  REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp")
+  REQUIRED_UNITS+=("oops" "pepper-assert" "pepper-csv2vec" "foder" "crew")
+  REQUIRED_UNITS+=("souschef")
+  REQUIRED_UNITS+=("safemain")
+  REQUIRED_UNITS+=("arser")
+  REQUIRED_UNITS+=("vconone")
+  # Hermes Logging Framework
+  REQUIRED_UNITS+=("hermes" "hermes-std")
+  # loco IR and related utilities
+  REQUIRED_UNITS+=("loco" "locop" "locomotiv" "logo-core" "logo")
+  # Flatbuffer I/O
+  REQUIRED_UNITS+=("mio-tflite" "mio-tflite260" "mio-circle")
+  # Circle compiler library (.circle -> .circle)
+  REQUIRED_UNITS+=("luci")
+  # Tools
+  REQUIRED_UNITS+=("tflite2circle" "circle2circle" "tflchef" "circlechef")
+  REQUIRED_UNITS+=("tf2tfliteV2" "luci-interpreter" "circle-verify")
+  REQUIRED_UNITS+=("luci-eval-driver")
+  REQUIRED_UNITS+=("record-minmax" "circle-quantizer" "rawdata2hdf5")
+  REQUIRED_UNITS+=("circle-partitioner")
+  REQUIRED_UNITS+=("one-cmds")
+  REQUIRED_UNITS+=("bcq-tools")
+
+  NPROC=$(cat /proc/cpuinfo | grep -c processor)
+
+  # TODO Use "nncc configure" and "nncc build"
+  cmake \
+    -G "MSYS Makefiles" \
+    -DUSE_PROTOBUF_LEGACY_IMPORT=ON \
+    -DCMAKE_EXE_LINKER_FLAGS="-Wl,--allow-multiple-definition" \
+    -DCMAKE_SHARED_LINKER_FLAGS="-Wl,--allow-multiple-definition" \
+    -DENABLE_TEST=OFF \
+    -DDOWNLOAD_GTEST=OFF \
+    -DBUILD_GTEST=OFF \
+    -DCMAKE_C_COMPILER=gcc \
+    -DCMAKE_CXX_COMPILER=g++ \
+    -DCMAKE_INSTALL_PREFIX="${NNCC_INSTALL_PREFIX}" \
+    -DCMAKE_BUILD_TYPE=release \
+    -DBUILD_WHITELIST=$(join_by ";" "${REQUIRED_UNITS[@]}") \
+    -DEXTERNALS_BUILD_THREADS=$((NPROC/2)) \
+    ${EXTRA_OPTIONS[@]} \
+    "${NNAS_PROJECT_PATH}/infra/nncc"
+}
+
+function preset_install()
+{
+  # Install libraries to bin/ for Windows release
+  mv ${NNCC_INSTALL_PREFIX}/lib/*.dll ${NNCC_INSTALL_PREFIX}/bin
+  rm -rf ${NNCC_INSTALL_PREFIX}/lib
+
+  install -t "${NNPKG_INSTALL_PREFIX}/bin" -D \
+    "${NNAS_PROJECT_PATH}/tools/nnpackage_tool/model2nnpkg/model2nnpkg.sh"
+
+  # Install tf2nnpkg
+  install -T -m 755 -D "${SCRIPT_PATH}/res/tf2nnpkg.20210910" "${NNAS_INSTALL_PREFIX}/bin/tf2nnpkg"
+
+  # Though you have to install tensorflow to run 'tf2tfliteV2',
+  # tensorflow can't be installed in mingw. First, You can install tensorflow 
+  # from Window native CMD(run as administrator) with python virtual environment.
+  # And, you must copy it to "${NNAS_INSTALL_PREFIX}/bin/venv"
+}
diff --git a/infra/packaging/res/tf2nnpkg.20210910 b/infra/packaging/res/tf2nnpkg.20210910
new file mode 100644
index 000000000..0d44818a1
--- /dev/null
+++ b/infra/packaging/res/tf2nnpkg.20210910
@@ -0,0 +1,109 @@
+#!/bin/bash
+
+set -e
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+command_exists() {
+  if [ "$#" -le 0 ]; then
+    return 1
+  fi
+  command -v "$@" > /dev/null 2>&1
+}
+
+usage()
+{
+  echo "Convert TensorFlow model to nnpackage."
+  echo "Usage: tf2nnpkg"
+  echo "    --info <path/to/info>"
+  echo "    --graphdef <path/to/pb>"
+  echo "    -o <path/to/nnpkg/directory>"
+  echo "    --v2 (optional) Use TF 2.x interface"
+  exit 255
+}
+
+TF_INTERFACE="--v1"
+
+# Parse command-line arguments
+#
+while [ "$#" -ne 0 ]; do
+  CUR="$1"
+
+  case $CUR in
+    '--help')
+      usage
+      ;;
+    '--info')
+      export INFO_FILE="$2"
+      shift 2
+      ;;
+    '--graphdef')
+      export GRAPHDEF_FILE="$2"
+      shift 2
+      ;;
+    '-o')
+      export OUTPUT_DIR="$2"
+      shift 2
+      ;;
+    '--v2')
+      TF_INTERFACE="--v2"
+      shift
+      ;;
+    *)
+      echo "${CUR}"
+      shift
+      ;;
+  esac
+done
+
+if [ -z ${GRAPHDEF_FILE} ] || [ ! -e ${GRAPHDEF_FILE} ]; then
+  echo "pb is not found. Please check --graphdef is correct."
+  exit 2
+fi
+
+if [ -z ${INFO_FILE} ] || [ ! -e ${INFO_FILE} ]; then
+  echo "info is not found. Please check --info is correct."
+  exit 2
+fi
+
+if [ -z ${OUTPUT_DIR} ]; then
+  echo "output directory is not specifed. Please check -o is correct.."
+  exit 2
+fi
+
+FILE_BASE=$(basename ${GRAPHDEF_FILE})
+MODEL_NAME="${FILE_BASE%.*}"
+TMPDIR=$(mktemp -d)
+trap "{ rm -rf $TMPDIR; }" EXIT
+
+# activate python virtual environment
+VIRTUALENV_LINUX="${ROOT}/bin/venv/bin/activate"
+VIRTUALENV_WINDOWS="${ROOT}/bin/venv/Scripts/activate"
+
+if [ -e ${VIRTUALENV_LINUX} ]; then
+  source ${VIRTUALENV_LINUX}
+elif [ -e ${VIRTUALENV_WINDOWS} ]; then
+  source ${VIRTUALENV_WINDOWS}
+fi
+
+# parse inputs, outputs from info file
+INPUT=$(awk -F, '/^input/ { print $2 }' ${INFO_FILE} | cut -d: -f1 | tr -d ' ' | paste -d, -s)
+OUTPUT=$(awk -F, '/^output/ { print $2 }' ${INFO_FILE} | cut -d: -f1 | tr -d ' ' | paste -d, -s)
+
+INPUT_SHAPES=$(grep ^input ${INFO_FILE} | cut -d "[" -f2 | cut -d "]" -f1 | tr -d ' ' | xargs | tr ' ' ':')
+
+ONE_IMPORT_BCQ_SCRIPT="${ROOT}/bin/one-import-bcq ${TF_INTERFACE} "
+ONE_IMPORT_BCQ_SCRIPT+="-i ${GRAPHDEF_FILE} "
+ONE_IMPORT_BCQ_SCRIPT+="-o ${TMPDIR}/${MODEL_NAME}.tmp.circle "
+ONE_IMPORT_BCQ_SCRIPT+="-I ${INPUT} "
+ONE_IMPORT_BCQ_SCRIPT+="-O ${OUTPUT} "
+if [ ! -z ${INPUT_SHAPES} ]; then
+  ONE_IMPORT_BCQ_SCRIPT+="-s ${INPUT_SHAPES} "
+fi
+
+${ONE_IMPORT_BCQ_SCRIPT}
+
+# optimize
+"${ROOT}/bin/circle2circle" --O1 "${TMPDIR}/${MODEL_NAME}.tmp.circle" "${TMPDIR}/${MODEL_NAME}.circle"
+
+"${ROOT}/bin/model2nnpkg.sh" -o "${OUTPUT_DIR}" "${TMPDIR}/${MODEL_NAME}.circle"
diff --git a/infra/scripts/compiler_modules.sh b/infra/scripts/compiler_modules.sh
index a63140eaf..e520dd381 100644
--- a/infra/scripts/compiler_modules.sh
+++ b/infra/scripts/compiler_modules.sh
@@ -8,7 +8,7 @@ DEBUG_BUILD_ITEMS+=";oops;pepper-assert;pepper-csv2vec"
 DEBUG_BUILD_ITEMS+=";hermes;hermes-std"
 DEBUG_BUILD_ITEMS+=";loco;locop;locomotiv;logo-core;logo"
 DEBUG_BUILD_ITEMS+=";foder;crew;souschef;arser;vconone"
-DEBUG_BUILD_ITEMS+=";safemain;mio-circle;mio-tflite"
+DEBUG_BUILD_ITEMS+=";safemain;mio-circle;mio-tflite;mio-tflite260"
 DEBUG_BUILD_ITEMS+=";tflite2circle"
 DEBUG_BUILD_ITEMS+=";luci"
 DEBUG_BUILD_ITEMS+=";luci-interpreter"
diff --git a/infra/scripts/docker_collect_nnpkg_resources.sh b/infra/scripts/docker_collect_nnpkg_resources.sh
index 65963f4b8..475da6d06 100755
--- a/infra/scripts/docker_collect_nnpkg_resources.sh
+++ b/infra/scripts/docker_collect_nnpkg_resources.sh
@@ -71,7 +71,7 @@ REQUIRED_UNITS+=("loco" "locop" "locomotiv" "logo-core" "logo")
 # Circle compiler library (.circle -> .circle)
 REQUIRED_UNITS+=("luci")
 # Flatbuffer I/O
-REQUIRED_UNITS+=("mio-tflite" "mio-circle")
+REQUIRED_UNITS+=("mio-tflite" "mio-tflite260" "mio-circle")
 # Tools
 REQUIRED_UNITS+=("tflite2circle" "circle2circle" "luci-interpreter")
 REQUIRED_UNITS+=("souschef" "tflchef" "circlechef" "circle-verify")
diff --git a/packaging/nnfw.spec b/packaging/nnfw.spec
index 0d170e7ed..4133d7a06 100644
--- a/packaging/nnfw.spec
+++ b/packaging/nnfw.spec
@@ -1,9 +1,9 @@
 Name:    nnfw
 Summary: nnfw
-Version: 1.17.0
+Version: 1.18.0
 Release: 1
 Group:   Development
-License: Apache-2.0 and MIT and BSD-2-Clause
+License: Apache-2.0 and MIT and BSD-2-Clause and MPL-2.0
 
 Source0: %{name}-%{version}.tar.gz
 Source1: %{name}.manifest
diff --git a/res/TensorFlowLiteRecipes/PadV2_001/test.recipe b/res/TensorFlowLiteRecipes/PadV2_001/test.recipe
new file mode 100644
index 000000000..0eafec931
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/PadV2_001/test.recipe
@@ -0,0 +1,68 @@
+operand {
+  name: "ifm"
+  type: FLOAT32
+  shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+  name: "relu"
+  type: FLOAT32
+  shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+  name: "padding"
+  type: INT32
+  shape { dim: 4 dim: 2 }
+  filler {
+    tag: "explicit"
+    arg: "0" arg: "0"
+    arg: "1" arg: "1"
+    arg: "1" arg: "1"
+    arg: "0" arg: "0"
+  }
+}
+operand {
+  name: "constant_values"
+  type: FLOAT32
+  shape { dim: 1 }
+  filler {
+    tag: "explicit"
+    arg: "-100.00"
+  }
+}
+operand {
+  name: "padv2"
+  type: FLOAT32
+  shape { dim: 1 dim: 5 dim: 5 dim: 2 }
+}
+operand {
+  name: "ofm"
+  type: FLOAT32
+  shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operation {
+  type: "ReLU"
+  input: "ifm"
+  output: "relu"
+}
+operation {
+  type: "PadV2"
+  input: "relu"
+  input: "padding"
+  input: "constant_values"
+  output: "padv2"
+}
+operation {
+  type: "MaxPool2D"
+  maxpool2d_options {
+    padding: VALID
+    stride_w: 1
+    stride_h: 1
+    filter_height: 3
+    filter_width: 3
+  }
+  input: "padv2"
+  output: "ofm"
+}
+
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/PadV2_001/test.rule b/res/TensorFlowLiteRecipes/PadV2_001/test.rule
new file mode 100644
index 000000000..29b080b1e
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/PadV2_001/test.rule
@@ -0,0 +1,8 @@
+# To check if PadV2 is converted to Pad
+
+RULE    "VERIFY_FILE_FORMAT"      $(verify_file_format) '=' 1
+
+RULE    "PAD_EXIST"               $(op_count PAD) '=' 1
+RULE    "MAXPOOL2D_EXIST"         $(op_count MAX_POOL_2D) '=' 1
+RULE    "RELU_EXIST"              $(op_count RELU) '=' 1
+RULE    "NO_PADV2"                $(op_count PADV2) '=' 0
diff --git a/res/TensorFlowLiteSchema/2.6.0/schema.fbs b/res/TensorFlowLiteSchema/2.6.0/schema.fbs
new file mode 100644
index 000000000..6fc51f838
--- /dev/null
+++ b/res/TensorFlowLiteSchema/2.6.0/schema.fbs
@@ -0,0 +1,1240 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Revision History
+// Version 0: Initial version.
+// Version 1: Add subgraphs to schema.
+// Version 2: Rename operators to conform to NN API.
+// Version 3: Move buffer data from Model.Subgraph.Tensors to Model.Buffers.
+// Version 3a: Add new builtin op code field. Has backward compatibility with
+//             version 3.
+
+namespace tflite;
+
+// This corresponds to the version.
+file_identifier "TFL3";
+// File extension of any written files.
+file_extension "tflite";
+
+// IMPORTANT: All new members of tables, enums and unions must be added at the
+// end to ensure backwards compatibility.
+
+// The type of data stored in a tensor.
+enum TensorType : byte {
+  FLOAT32 = 0,
+  FLOAT16 = 1,
+  INT32 = 2,
+  UINT8 = 3,
+  INT64 = 4,
+  STRING = 5,
+  BOOL = 6,
+  INT16 = 7,
+  COMPLEX64 = 8,
+  INT8 = 9,
+  FLOAT64 = 10,
+  COMPLEX128 = 11,
+  UINT64 = 12,
+  // Experimental: Resource and variant types are experimental, that are subject
+  // to change. Do not implement custom kernels using resource & variant types
+  // now.
+  RESOURCE = 13,
+  VARIANT = 14,
+  UINT32 = 15,
+}
+
+// Custom quantization parameters for experimenting with new quantization
+// techniques.
+table CustomQuantization {
+  custom:[ubyte] (force_align: 16);
+}
+
+// Represents a specific quantization technique's parameters.
+union QuantizationDetails {
+  CustomQuantization,
+}
+
+// Parameters for converting a quantized tensor back to float.
+table QuantizationParameters {
+  // These four parameters are the asymmetric linear quantization parameters.
+  // Given a quantized value q, the corresponding float value f should be:
+  //   f = scale * (q - zero_point)
+  // For other quantization types, the QuantizationDetails below is used.
+  min:[float];  // For importing back into tensorflow.
+  max:[float];  // For importing back into tensorflow.
+  scale:[float];  // For dequantizing the tensor's values.
+  zero_point:[long];
+
+  // If this is not none, the other quantization parameters (i.e. min, max,
+  // scale, zero_point fields above) are ignored and the value of the
+  // QuantizationDetails union should be used.
+  details:QuantizationDetails;
+
+  // Specifies the dimension of the Tensor's shape that the scales and
+  // zero_points correspond to. For example, a tensor t, with dims=[4, 3, 2, 1]
+  // with quantization params:
+  //   scale=[1.0, 2.0, 3.0], zero_point=[1, 2, 3], quantization_dimension=1
+  // will be quantized across the second dimension of t.
+  //   t[:, 0, :, :] will have scale[0]=1.0, zero_point[0]=1
+  //   t[:, 1, :, :] will have scale[1]=2.0, zero_point[0]=2
+  //   t[:, 2, :, :] will have scale[2]=3.0, zero_point[0]=3
+  quantized_dimension:int;
+}
+
+// Sparse tensors.
+// We use a modification of the TACO format.
+// Reference: http://tensor-compiler.org/kjolstad-oopsla17-tensor-compiler.pdf
+//
+// To encode a conceptual n-dimensional dense tensor with dims (d0, ..., dn-1),
+// potentially with a k-dimensional block (0 <= k <= n) with dims
+// (dn, ..., dn+k-1), the format needs to specify:
+//   1. In what order to traverse these dimensions. For example, to store a 2-D
+//      matrix in row major order, the traversal order would be (d0, d1),
+//      whereas to store it in column major order, the traversal order would be
+//      (d1, d0). If the 2-D matrix has a 2-D inner block, the traversal order
+//      could be (d0, d1, d2, d3).
+//   2. How each block dimension in (dn, ..., dn+k-1) maps to the original
+//      tensor dimension in (d0, ..., dn-1).
+//   3. In the traversal order defined above, the format (dense vs. sparse) and
+//      index metadata for each dimension. For a dense dimension, this is just
+//      the size of that dimension. For a sparse dimension, it's the same as
+//      the compressed index defined in the Compressed Sparse Row (CSR) format.
+//      (http://scipy-lectures.org/advanced/scipy_sparse/csr_matrix.html)
+
+// The storage type for a dimension. Currently we support:
+//   1. DENSE: each coordinate in this dimension is stored implicitly.
+//   2. SPARSE_CSR: only the coordinates with non-zero elements are stored. The
+//      compression technique is the same what CSR uses.
+// More types like a sparse dimension with a different compression technique
+// could be added to the list in the future.
+enum DimensionType : byte {
+  DENSE = 0,
+  SPARSE_CSR = 1,
+}
+
+table Int32Vector {
+  values:[int];
+}
+
+table Uint16Vector {
+  values:[ushort] (force_align: 4);
+}
+
+table Uint8Vector {
+  values:[ubyte] (force_align: 4);
+}
+
+// Variable-typed buffer to store the index metadata for a sparse dimension.
+// The widest type is Int32 instead of UInt32 because tensor's shape is a int32
+// vector. We don't want the per-dimensional index to overflow that range.
+union SparseIndexVector {
+  Int32Vector,
+  Uint16Vector,
+  Uint8Vector
+}
+
+table DimensionMetadata {
+  // Whether a dimension is dense or sparse.
+  format:DimensionType;
+  // Index metadata used for a dimension.
+  //   - If format is DimensionType.DENSE then we use the dense_size field to
+  //     store the size of that dimension. Each index in that dimension is
+  //     stored implicitly.
+  //   - If format is DimensionType.SPARSE_CSR then we use array_segments and
+  //     array_indices to encode that dimension. array_segments represents how
+  //     to segment the indices array, each segment corresponds to one element
+  //     in the previous dimension. array_indices represents the index of the
+  //     non-zero elements within this dimension (as those in the CSR matrix
+  //     format, where the first array is row pointers and the second array is
+  //     column indices).
+  dense_size:int;
+  array_segments:SparseIndexVector;
+  array_indices:SparseIndexVector;
+}
+
+// Parameters to encode a sparse TfLite tensor.
+table SparsityParameters {
+  // The traversal order of the dimensions defined in the `shape` field of the
+  // conceptual dense tensor. For a n-dimensional tensors with dims (d0, d1,
+  // ..., dn-1),
+  //   - if not block sparse, the traversal_order is just a permutation of (d0,
+  //     ..., dn-1). For example, a 2-D matrix stored in row-major order would
+  //     have traversal_order = (d0, d1).
+  //   - if block sparse with a k-dimensional block (0 <= k <= n), the
+  //     traversal_order has n + k elements. The first n elements are still a
+  //     permutation of (d0, ..., dn-1). The lask k elements are a permutation
+  //     of (dn, ..., dn+k-1), defining how to traverse a block internally. For
+  //     example, a 2-D matrix with 2-D blocks, both stored in row-major order
+  //     would have traversal_order = (d0, d1, d2, d3).
+  traversal_order:[int];
+  // For an n-dimensional tensor with a k-dimensional block (0 <= k <= n),
+  // stores how a block dimension in (dn, ..., dn+k-1) maps to the original
+  // tensor dimension in (d0, ..., dn).
+  // It's stored in the order of (dn, ..., dn+k-1).
+  // If not block-sparse, this field is NULL.
+  block_map:[int];
+  // In the traversal order defined above, the metadata needed for
+  // each dimension to locate the non-zero values in the original dense tensor.
+  // The size of the dim_metadata array = the size of the traversal_order array
+  // = n + k.
+  dim_metadata:[DimensionMetadata];
+}
+
+table Tensor {
+  // The tensor shape. The meaning of each entry is operator-specific but
+  // builtin ops use: [batch size, height, width, number of channels] (That's
+  // Tensorflow's NHWC).
+  shape:[int];
+  type:TensorType;
+  // An index that refers to the buffers table at the root of the model. Or,
+  // if there is no data buffer associated (i.e. intermediate results), then
+  // this is 0 (which refers to an always existent empty buffer).
+  //
+  // The data_buffer itself is an opaque container, with the assumption that the
+  // target device is little-endian. In addition, all builtin operators assume
+  // the memory is ordered such that if `shape` is [4, 3, 2], then index
+  // [i, j, k] maps to data_buffer[i*3*2 + j*2 + k].
+  buffer:uint;
+  name:string;  // For debugging and importing back into tensorflow.
+  quantization:QuantizationParameters;  // Optional.
+
+  is_variable:bool = false;
+
+  // Parameters to encode a sparse tensor. See the example in
+  // tensorflow/lite/testdata/sparse_tensor.json.
+  sparsity:SparsityParameters;  // Optional.
+
+  // Encodes `shape` with unknown dimensions. Unknown dimensions are
+  // represented with -1.
+  shape_signature:[int]; // Optional.
+}
+
+// A list of builtin operators. Builtin operators are slightly faster than custom
+// ones, but not by much. Moreover, while custom operators accept an opaque
+// object containing configuration parameters, builtins have a predetermined
+// set of acceptable options.
+// LINT.IfChange
+enum BuiltinOperator : int32 {
+  ADD = 0,
+  AVERAGE_POOL_2D = 1,
+  CONCATENATION = 2,
+  CONV_2D = 3,
+  DEPTHWISE_CONV_2D = 4,
+  DEPTH_TO_SPACE = 5,
+  DEQUANTIZE = 6,
+  EMBEDDING_LOOKUP = 7,
+  FLOOR = 8,
+  FULLY_CONNECTED = 9,
+  HASHTABLE_LOOKUP = 10,
+  L2_NORMALIZATION = 11,
+  L2_POOL_2D = 12,
+  LOCAL_RESPONSE_NORMALIZATION = 13,
+  LOGISTIC = 14,
+  LSH_PROJECTION = 15,
+  LSTM = 16,
+  MAX_POOL_2D = 17,
+  MUL = 18,
+  RELU = 19,
+  // NOTE(aselle): RELU_N1_TO_1 used to be called RELU1, but it was renamed
+  // since different model developers use RELU1 in different ways. Never
+  // create another op called RELU1.
+  RELU_N1_TO_1 = 20,
+  RELU6 = 21,
+  RESHAPE = 22,
+  RESIZE_BILINEAR = 23,
+  RNN = 24,
+  SOFTMAX = 25,
+  SPACE_TO_DEPTH = 26,
+  SVDF = 27,
+  TANH = 28,
+  CONCAT_EMBEDDINGS = 29,
+  SKIP_GRAM = 30,
+  CALL = 31,
+  CUSTOM = 32,
+  EMBEDDING_LOOKUP_SPARSE = 33,
+  PAD = 34,
+  UNIDIRECTIONAL_SEQUENCE_RNN = 35,
+  GATHER = 36,
+  BATCH_TO_SPACE_ND = 37,
+  SPACE_TO_BATCH_ND = 38,
+  TRANSPOSE = 39,
+  MEAN = 40,
+  SUB = 41,
+  DIV = 42,
+  SQUEEZE = 43,
+  UNIDIRECTIONAL_SEQUENCE_LSTM = 44,
+  STRIDED_SLICE = 45,
+  BIDIRECTIONAL_SEQUENCE_RNN = 46,
+  EXP = 47,
+  TOPK_V2 = 48,
+  SPLIT = 49,
+  LOG_SOFTMAX = 50,
+  // DELEGATE is a special op type for the operations which are delegated to
+  // other backends.
+  // WARNING: Experimental interface, subject to change
+  DELEGATE = 51,
+  BIDIRECTIONAL_SEQUENCE_LSTM = 52,
+  CAST = 53,
+  PRELU = 54,
+  MAXIMUM = 55,
+  ARG_MAX = 56,
+  MINIMUM = 57,
+  LESS = 58,
+  NEG = 59,
+  PADV2 = 60,
+  GREATER = 61,
+  GREATER_EQUAL = 62,
+  LESS_EQUAL = 63,
+  SELECT = 64,
+  SLICE = 65,
+  SIN = 66,
+  TRANSPOSE_CONV = 67,
+  SPARSE_TO_DENSE = 68,
+  TILE = 69,
+  EXPAND_DIMS = 70,
+  EQUAL = 71,
+  NOT_EQUAL = 72,
+  LOG = 73,
+  SUM = 74,
+  SQRT = 75,
+  RSQRT = 76,
+  SHAPE = 77,
+  POW = 78,
+  ARG_MIN = 79,
+  FAKE_QUANT = 80,
+  REDUCE_PROD = 81,
+  REDUCE_MAX = 82,
+  PACK = 83,
+  LOGICAL_OR = 84,
+  ONE_HOT = 85,
+  LOGICAL_AND = 86,
+  LOGICAL_NOT = 87,
+  UNPACK = 88,
+  REDUCE_MIN = 89,
+  FLOOR_DIV = 90,
+  REDUCE_ANY = 91,
+  SQUARE = 92,
+  ZEROS_LIKE = 93,
+  FILL = 94,
+  FLOOR_MOD = 95,
+  RANGE = 96,
+  RESIZE_NEAREST_NEIGHBOR = 97,
+  LEAKY_RELU = 98,
+  SQUARED_DIFFERENCE = 99,
+  MIRROR_PAD = 100,
+  ABS = 101,
+  SPLIT_V = 102,
+  UNIQUE = 103,
+  CEIL = 104,
+  REVERSE_V2 = 105,
+  ADD_N = 106,
+  GATHER_ND = 107,
+  COS = 108,
+  WHERE = 109,
+  RANK = 110,
+  ELU = 111,
+  REVERSE_SEQUENCE = 112,
+  MATRIX_DIAG = 113,
+  QUANTIZE = 114,
+  MATRIX_SET_DIAG = 115,
+  ROUND = 116,
+  HARD_SWISH = 117,
+  IF = 118,
+  WHILE = 119,
+  NON_MAX_SUPPRESSION_V4 = 120,
+  NON_MAX_SUPPRESSION_V5 = 121,
+  SCATTER_ND = 122,
+  SELECT_V2 = 123,
+  DENSIFY = 124,
+  SEGMENT_SUM = 125,
+  BATCH_MATMUL = 126,
+  PLACEHOLDER_FOR_GREATER_OP_CODES = 127,
+  CUMSUM = 128,
+  CALL_ONCE = 129,
+  BROADCAST_TO = 130,
+  RFFT2D = 131,
+  CONV_3D = 132,
+  IMAG=133,
+  REAL=134,
+  COMPLEX_ABS=135,
+  HASHTABLE = 136,
+  HASHTABLE_FIND = 137,
+  HASHTABLE_IMPORT = 138,
+  HASHTABLE_SIZE = 139,
+  REDUCE_ALL = 140,
+  CONV_3D_TRANSPOSE = 141,
+  VAR_HANDLE = 142,
+  READ_VARIABLE = 143,
+  ASSIGN_VARIABLE = 144,
+}
+// LINT.ThenChange(nnapi_linter/linter.proto)
+
+// Options for the builtin operators.
+union BuiltinOptions {
+  Conv2DOptions,
+  DepthwiseConv2DOptions,
+  ConcatEmbeddingsOptions,
+  LSHProjectionOptions,
+  Pool2DOptions,
+  SVDFOptions,
+  RNNOptions,
+  FullyConnectedOptions,
+  SoftmaxOptions,
+  ConcatenationOptions,
+  AddOptions,
+  L2NormOptions,
+  LocalResponseNormalizationOptions,
+  LSTMOptions,
+  ResizeBilinearOptions,
+  CallOptions,
+  ReshapeOptions,
+  SkipGramOptions,
+  SpaceToDepthOptions,
+  EmbeddingLookupSparseOptions,
+  MulOptions,
+  PadOptions,
+  GatherOptions,
+  BatchToSpaceNDOptions,
+  SpaceToBatchNDOptions,
+  TransposeOptions,
+  ReducerOptions,
+  SubOptions,
+  DivOptions,
+  SqueezeOptions,
+  SequenceRNNOptions,
+  StridedSliceOptions,
+  ExpOptions,
+  TopKV2Options,
+  SplitOptions,
+  LogSoftmaxOptions,
+  CastOptions,
+  DequantizeOptions,
+  MaximumMinimumOptions,
+  ArgMaxOptions,
+  LessOptions,
+  NegOptions,
+  PadV2Options,
+  GreaterOptions,
+  GreaterEqualOptions,
+  LessEqualOptions,
+  SelectOptions,
+  SliceOptions,
+  TransposeConvOptions,
+  SparseToDenseOptions,
+  TileOptions,
+  ExpandDimsOptions,
+  EqualOptions,
+  NotEqualOptions,
+  ShapeOptions,
+  PowOptions,
+  ArgMinOptions,
+  FakeQuantOptions,
+  PackOptions,
+  LogicalOrOptions,
+  OneHotOptions,
+  LogicalAndOptions,
+  LogicalNotOptions,
+  UnpackOptions,
+  FloorDivOptions,
+  SquareOptions,
+  ZerosLikeOptions,
+  FillOptions,
+  BidirectionalSequenceLSTMOptions,
+  BidirectionalSequenceRNNOptions,
+  UnidirectionalSequenceLSTMOptions,
+  FloorModOptions,
+  RangeOptions,
+  ResizeNearestNeighborOptions,
+  LeakyReluOptions,
+  SquaredDifferenceOptions,
+  MirrorPadOptions,
+  AbsOptions,
+  SplitVOptions,
+  UniqueOptions,
+  ReverseV2Options,
+  AddNOptions,
+  GatherNdOptions,
+  CosOptions,
+  WhereOptions,
+  RankOptions,
+  ReverseSequenceOptions,
+  MatrixDiagOptions,
+  QuantizeOptions,
+  MatrixSetDiagOptions,
+  HardSwishOptions,
+  IfOptions,
+  WhileOptions,
+  DepthToSpaceOptions,
+  NonMaxSuppressionV4Options,
+  NonMaxSuppressionV5Options,
+  ScatterNdOptions,
+  SelectV2Options,
+  DensifyOptions,
+  SegmentSumOptions,
+  BatchMatMulOptions,
+  CumsumOptions,
+  CallOnceOptions,
+  BroadcastToOptions,
+  Rfft2dOptions,
+  Conv3DOptions,
+  HashtableOptions,
+  HashtableFindOptions,
+  HashtableImportOptions,
+  HashtableSizeOptions,
+  VarHandleOptions,
+  ReadVariableOptions,
+  AssignVariableOptions,
+}
+
+enum Padding : byte { SAME, VALID }
+
+enum ActivationFunctionType : byte {
+  NONE = 0,
+  RELU = 1,
+  RELU_N1_TO_1 = 2,
+  RELU6 = 3,
+  TANH = 4,
+  SIGN_BIT = 5,
+}
+
+table Conv2DOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  fused_activation_function:ActivationFunctionType;
+  dilation_w_factor:int = 1;
+  dilation_h_factor:int = 1;
+}
+
+// Options for both Conv3D and Conv3DTranspose.
+table Conv3DOptions {
+  padding:Padding;
+  stride_d:int;
+  stride_w:int;
+  stride_h:int;
+  fused_activation_function:ActivationFunctionType;
+  dilation_d_factor:int = 1;
+  dilation_w_factor:int = 1;
+  dilation_h_factor:int = 1;
+}
+
+table Pool2DOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  filter_width:int;
+  filter_height:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table DepthwiseConv2DOptions {
+  // Parameters for DepthwiseConv version 1 or above.
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  // `depth_multiplier` is redundant. It's used by CPU kernels in
+  // TensorFlow 2.0 or below, but ignored in versions above.
+  // See comments in lite/c/builtin_op_data.h for more details.
+  depth_multiplier:int;
+  fused_activation_function:ActivationFunctionType;
+  // Parameters for DepthwiseConv version 2 or above.
+  dilation_w_factor:int = 1;
+  dilation_h_factor:int = 1;
+}
+
+table ConcatEmbeddingsOptions {
+  num_channels:int;
+  num_columns_per_channel:[int];
+  embedding_dim_per_channel:[int]; // This could be inferred from parameters.
+}
+
+enum LSHProjectionType: byte {
+  UNKNOWN = 0,
+  SPARSE = 1,
+  DENSE = 2,
+}
+
+table LSHProjectionOptions {
+  type: LSHProjectionType;
+}
+
+table SVDFOptions {
+  rank:int;
+  fused_activation_function:ActivationFunctionType;
+  // For weights-only quantization, use asymmetric quantization for non
+  // constant inputs at evaluation time.
+  asymmetric_quantize_inputs:bool;
+}
+
+// An implementation of TensorFlow RNNCell.
+table RNNOptions {
+  fused_activation_function:ActivationFunctionType;
+  asymmetric_quantize_inputs:bool;
+}
+
+// An implementation of TensorFlow dynamic_rnn with RNNCell.
+table SequenceRNNOptions {
+  time_major:bool;
+  fused_activation_function:ActivationFunctionType;
+  asymmetric_quantize_inputs:bool;
+}
+
+// An implementation of TensorFlow bidrectional_dynamic_rnn with RNNCell.
+table BidirectionalSequenceRNNOptions {
+  time_major:bool;
+  fused_activation_function:ActivationFunctionType;
+  merge_outputs: bool;
+  asymmetric_quantize_inputs:bool;
+}
+
+enum FullyConnectedOptionsWeightsFormat: byte {
+  DEFAULT = 0,
+  SHUFFLED4x16INT8 = 1,
+}
+
+// An implementation of TensorFlow fully_connected (a.k.a Dense) layer.
+table FullyConnectedOptions {
+  // Parameters for FullyConnected version 1 or above.
+  fused_activation_function:ActivationFunctionType;
+
+  // Parameters for FullyConnected version 2 or above.
+  weights_format:FullyConnectedOptionsWeightsFormat = DEFAULT;
+
+  // Parameters for FullyConnected version 5 or above.
+  // If set to true, then the number of dimension is preserved. Furthermore,
+  // all but the last dimension of the input and output shapes will be equal.
+  keep_num_dims: bool;
+
+  // Parameters for FullyConnected version 7 or above.
+  // If set to true, then weights-only op will use asymmetric quantization for
+  // inputs.
+  asymmetric_quantize_inputs: bool;
+}
+
+table SoftmaxOptions {
+  beta: float;
+}
+
+// An implementation of TensorFlow concat.
+table ConcatenationOptions {
+  axis:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table AddOptions {
+  fused_activation_function:ActivationFunctionType;
+  // Parameters supported by version 3.
+  pot_scale_int16:bool = true;
+}
+
+table MulOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table L2NormOptions {
+  // This field is currently ignored in the L2 Norm Op.
+  fused_activation_function:ActivationFunctionType;
+}
+
+table LocalResponseNormalizationOptions {
+  radius:int;
+  bias:float;
+  alpha:float;
+  beta:float;
+}
+
+enum LSTMKernelType : byte {
+  // Full LSTM kernel which supports peephole and projection.
+  FULL = 0,
+  // Basic LSTM kernels. Equivalent to TensorFlow BasicLSTMCell.
+  BASIC = 1,
+}
+
+// An implementation of TensorFlow LSTMCell and CoupledInputForgetGateLSTMCell
+table LSTMOptions {
+  // Parameters for LSTM version 1 or above.
+  fused_activation_function:ActivationFunctionType;
+  cell_clip: float; // Optional, 0.0 means no clipping
+  proj_clip: float; // Optional, 0.0 means no clipping
+
+  // Parameters for LSTM version 2 or above.
+  // Basic kernel is only supported in version 2 or above.
+  kernel_type: LSTMKernelType = FULL;
+
+  // Parameters for LSTM version 4 or above.
+  asymmetric_quantize_inputs: bool;
+}
+
+// An implementation of TensorFlow dynamic_rnn with LSTMCell.
+table UnidirectionalSequenceLSTMOptions {
+  fused_activation_function:ActivationFunctionType;
+  cell_clip: float; // Optional, 0.0 means no clipping
+  proj_clip: float; // Optional, 0.0 means no clipping
+
+  // If true then first dimension is sequence, otherwise batch.
+  time_major:bool;
+
+  // Parameter for Unidirectional Sequence LSTM version 4.
+  asymmetric_quantize_inputs:bool;
+}
+
+table BidirectionalSequenceLSTMOptions {
+  // Parameters supported by version 1:
+  fused_activation_function:ActivationFunctionType;
+  cell_clip: float; // Optional, 0.0 means no clipping
+  proj_clip: float; // Optional, 0.0 means no clipping
+
+  // If true, store the outputs of both directions into the first output.
+  merge_outputs: bool;
+
+  // Parameters supported by version 2:
+  // If true then first dimension is sequence, otherwise batch.
+  // Version 1 implementations assumed time_major to be true, so this default
+  // value should never change.
+  time_major: bool = true;
+
+  // Parameters for version 3 or above.
+  asymmetric_quantize_inputs:bool;
+}
+
+table ResizeBilinearOptions {
+  new_height: int (deprecated);
+  new_width: int (deprecated);
+  align_corners: bool;
+  half_pixel_centers: bool;
+}
+
+table ResizeNearestNeighborOptions {
+  align_corners: bool;
+  half_pixel_centers: bool;
+}
+
+// A call operation options
+table CallOptions {
+  // The subgraph index that needs to be called.
+  subgraph:uint;
+}
+
+table PadOptions {
+}
+
+table PadV2Options {
+}
+
+table ReshapeOptions {
+  new_shape:[int];
+}
+
+table SpaceToBatchNDOptions {
+}
+
+table BatchToSpaceNDOptions {
+}
+
+table SkipGramOptions {
+  ngram_size: int;
+  max_skip_size: int;
+  include_all_ngrams: bool;
+}
+
+table SpaceToDepthOptions {
+  block_size: int;
+}
+
+table DepthToSpaceOptions {
+  block_size: int;
+}
+
+table SubOptions {
+  fused_activation_function:ActivationFunctionType;
+  // Parameters supported by version 5
+  pot_scale_int16:bool = true;
+}
+
+table DivOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table TopKV2Options {
+}
+
+enum CombinerType : byte {
+  SUM = 0,
+  MEAN = 1,
+  SQRTN = 2,
+}
+
+table EmbeddingLookupSparseOptions {
+  combiner:CombinerType;
+}
+
+table GatherOptions {
+  axis: int;
+  // Parameters for Gather version 5 or above.
+  batch_dims: int = 0;
+}
+
+table TransposeOptions {
+}
+
+table ExpOptions {
+}
+
+table CosOptions {
+}
+
+table ReducerOptions {
+  keep_dims: bool;
+}
+
+table SqueezeOptions {
+  squeeze_dims:[int];
+}
+
+table SplitOptions {
+  num_splits: int;
+}
+
+table SplitVOptions {
+  num_splits: int;
+}
+
+table StridedSliceOptions {
+  begin_mask: int;
+  end_mask: int;
+  ellipsis_mask: int;
+  new_axis_mask: int;
+  shrink_axis_mask: int;
+}
+
+table LogSoftmaxOptions {
+}
+
+table CastOptions {
+  in_data_type: TensorType;
+  out_data_type: TensorType;
+}
+
+table DequantizeOptions {
+}
+
+table MaximumMinimumOptions {
+}
+
+table TileOptions {
+}
+
+table ArgMaxOptions {
+  output_type : TensorType;
+}
+
+table ArgMinOptions {
+  output_type : TensorType;
+}
+
+table GreaterOptions {
+}
+
+table GreaterEqualOptions {
+}
+
+table LessOptions {
+}
+
+table LessEqualOptions {
+}
+
+table NegOptions {
+}
+
+table SelectOptions {
+}
+
+table SliceOptions {
+}
+
+table TransposeConvOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+}
+
+table ExpandDimsOptions {
+}
+
+table SparseToDenseOptions {
+  validate_indices:bool;
+}
+
+table EqualOptions {
+}
+
+table NotEqualOptions {
+}
+
+table ShapeOptions {
+  // Optional output type of the operation (int32 or int64). Defaults to int32.
+  out_type : TensorType;
+}
+
+table RankOptions {
+}
+
+table PowOptions {
+}
+
+table FakeQuantOptions {
+  // Parameters supported by version 1:
+  min:float;
+  max:float;
+  num_bits:int;
+
+  // Parameters supported by version 2:
+  narrow_range:bool;
+}
+
+table PackOptions {
+  values_count:int;
+  axis:int;
+}
+
+table LogicalOrOptions {
+}
+
+table OneHotOptions {
+  axis:int;
+}
+
+table AbsOptions {
+}
+
+
+table HardSwishOptions {
+}
+
+table LogicalAndOptions {
+}
+
+table LogicalNotOptions {
+}
+
+table UnpackOptions {
+  num:int;
+  axis:int;
+}
+
+table FloorDivOptions {
+}
+
+table SquareOptions {
+}
+
+table ZerosLikeOptions {
+}
+
+table FillOptions {
+}
+
+table FloorModOptions {
+}
+
+table RangeOptions {
+}
+
+table LeakyReluOptions {
+  alpha:float;
+}
+
+table SquaredDifferenceOptions {
+}
+
+enum MirrorPadMode : byte {
+  // Doesn't include borders.
+  REFLECT = 0,
+  // Includes borders.
+  SYMMETRIC = 1,
+}
+
+table MirrorPadOptions {
+  mode:MirrorPadMode;
+}
+
+table UniqueOptions {
+  idx_out_type:TensorType = INT32;
+}
+
+table ReverseV2Options {
+}
+
+table AddNOptions {
+}
+
+table GatherNdOptions {
+}
+
+table WhereOptions {
+}
+
+table ReverseSequenceOptions {
+  seq_dim:int;
+  batch_dim:int = 0;
+}
+
+table MatrixDiagOptions {
+}
+
+table QuantizeOptions {
+}
+
+table MatrixSetDiagOptions {
+}
+
+table IfOptions {
+  then_subgraph_index:int;
+  else_subgraph_index:int;
+}
+
+table CallOnceOptions {
+  init_subgraph_index:int;
+}
+
+table WhileOptions {
+  cond_subgraph_index:int;
+  body_subgraph_index:int;
+}
+
+table NonMaxSuppressionV4Options {
+}
+
+table NonMaxSuppressionV5Options {
+}
+
+table ScatterNdOptions {
+}
+
+table SelectV2Options {
+}
+
+table DensifyOptions {
+}
+
+table SegmentSumOptions {
+}
+
+table BatchMatMulOptions {
+  adj_x:bool;
+  adj_y:bool;
+  // Parameters for BatchMatMul version 4 or above.
+  // If set to true, then weights-only op will use asymmetric quantization for
+  // inputs.
+  asymmetric_quantize_inputs: bool;
+}
+
+table CumsumOptions {
+  exclusive:bool;
+  reverse:bool;
+}
+
+table BroadcastToOptions {
+}
+
+table Rfft2dOptions {
+}
+
+table HashtableOptions {
+  // The identity of hash tables. This identity will be used across different
+  // subgraphs in the same interpreter instance.
+  table_id:int;
+  key_dtype:TensorType;
+  value_dtype:TensorType;
+}
+
+table HashtableFindOptions {
+}
+
+table HashtableImportOptions {
+}
+
+table HashtableSizeOptions {
+}
+
+table VarHandleOptions {
+  container:string;
+  shared_name:string;
+}
+
+table ReadVariableOptions {
+}
+
+table AssignVariableOptions {
+}
+
+// An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
+// builtin, or a string if the operator is custom.
+table OperatorCode {
+  // This field is for backward compatibility. This field will be used when
+  // the value of the extended builtin_code field has less than
+  // BulitinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES.
+  deprecated_builtin_code:byte;
+  custom_code:string;
+
+  // The version of the operator. The version need to be bumped whenever new
+  // parameters are introduced into an op.
+  version:int = 1;
+
+  // This field is introduced for resolving op builtin code shortage problem
+  // (the original BuiltinOperator enum field was represented as a byte).
+  // This field will be used when the value of the extended builtin_code field
+  // has greater than BulitinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES.
+  builtin_code:BuiltinOperator;
+}
+
+enum CustomOptionsFormat : byte {
+  FLEXBUFFERS = 0,
+}
+
+// An operator takes tensors as inputs and outputs. The type of operation being
+// performed is determined by an index into the list of valid OperatorCodes,
+// while the specifics of each operations is configured using builtin_options
+// or custom_options.
+table Operator {
+  // Index into the operator_codes array. Using an integer here avoids
+  // complicate map lookups.
+  opcode_index:uint;
+
+  // Optional input are indicated by -1.
+  inputs:[int];
+  outputs:[int];
+
+  builtin_options:BuiltinOptions;
+  custom_options:[ubyte];
+  custom_options_format:CustomOptionsFormat;
+
+  // A list of booleans indicating the input tensors which are being mutated by
+  // this operator.(e.g. used by RNN and LSTM).
+  // For example, if the "inputs" array refers to 5 tensors and the second and
+  // fifth are mutable variables, then this list will contain
+  // [false, true, false, false, true].
+  //
+  // If the list is empty, no variable is mutated in this operator.
+  // The list either has the same length as `inputs`, or is empty.
+  mutating_variable_inputs:[bool];
+
+  // A list of indices to the subgraph's "tensors" that are internal to an Op.
+  // Internal tensors are those that do not flow in or out of the operation,
+  // but instead are part of internal computation. As such, the operation's
+  // implementation may manage its memory more efficiently. They are needed
+  // however (i.e. not just an implementation detail) since they are part of the
+  // computation, which may require relevant metadata such as quantization
+  // parameters.
+  intermediates:[int];
+}
+
+// The root type, defining a subgraph, which typically represents an entire
+// model.
+table SubGraph {
+  // A list of all tensors used in this subgraph.
+  tensors:[Tensor];
+
+  // Indices of the tensors that are inputs into this subgraph. Note this is
+  // the list of non-static tensors that feed into the subgraph for inference.
+  inputs:[int];
+
+  // Indices of the tensors that are outputs out of this subgraph. Note this is
+  // the list of output tensors that are considered the product of the
+  // subgraph's inference.
+  outputs:[int];
+
+  // All operators, in execution order.
+  operators:[Operator];
+
+  // Name of this subgraph (used for debugging).
+  name:string;
+}
+
+// Table of raw data buffers (used for constant tensors). Referenced by tensors
+// by index. The generous alignment accommodates mmap-friendly data structures.
+table Buffer {
+  data:[ubyte] (force_align: 16);
+}
+
+table Metadata {
+  // A human readable string to uniquely identify a Metadata.
+  name:string;
+  // An index to the buffers table.
+  buffer:uint;
+}
+
+// Map from an alias name of tensor to tensor index in the graph.
+// This is used in Signature def.
+table TensorMap {
+  // Represents the alias to use for this tensor.
+  name:string;
+
+  // The actual tensor index in the primary graph, that 'name' corresponds to.
+  tensor_index:uint;
+}
+
+// This corresponds to SignatureDef in Tensorflow SavedModel.
+// The SignatureDef will be part of the SavedModel provided for conversion.
+table SignatureDef {
+  // Named inputs for this signature.
+  inputs:[TensorMap];
+
+  // Named outputs for this signature.
+  outputs:[TensorMap];
+
+  // Exported method name for this signature.
+  method_name:string;
+
+  // Key value which was in the Tensorflow SavedModel SignatureDef map.
+  key:string;
+
+  // Subgraph index of the exported method.
+  subgraph_index:uint;
+}
+
+table Model {
+  // Version of the schema.
+  version:uint;
+
+  // A list of all operator codes used in this model. This is
+  // kept in order because operators carry an index into this
+  // vector.
+  operator_codes:[OperatorCode];
+
+  // All the subgraphs of the model. The 0th is assumed to be the main
+  // model.
+  subgraphs:[SubGraph];
+
+  // A description of the model.
+  description:string;
+
+  // Buffers of the model.
+  // Note the 0th entry of this array must be an empty buffer (sentinel).
+  // This is a convention so that tensors without a buffer can provide 0 as
+  // their buffer.
+  buffers:[Buffer];
+
+  // Metadata about the model. Indirects into the existings buffers list.
+  // Deprecated, prefer to use metadata field.
+  metadata_buffer:[int];
+
+  // Metadata about the model.
+  metadata:[Metadata];
+
+  // Optional SignatureDefs for the model.
+  signature_defs:[SignatureDef];
+}
+
+root_type Model;
diff --git a/res/TensorFlowLiteSchema/SCHEMA.lst b/res/TensorFlowLiteSchema/SCHEMA.lst
index 73dfacd7b..609ef4b0b 100644
--- a/res/TensorFlowLiteSchema/SCHEMA.lst
+++ b/res/TensorFlowLiteSchema/SCHEMA.lst
@@ -6,3 +6,4 @@ VERSION,URL
 2.2.0,https://raw.githubusercontent.com/tensorflow/tensorflow/v2.2.0/tensorflow/lite/schema/schema.fbs
 2.3.0-rc0,https://raw.githubusercontent.com/tensorflow/tensorflow/v2.3.0-rc0/tensorflow/lite/schema/schema.fbs
 2.3.0,https://raw.githubusercontent.com/tensorflow/tensorflow/v2.3.0/tensorflow/lite/schema/schema.fbs
+2.6.0,https://raw.githubusercontent.com/tensorflow/tensorflow/v2.6.0/tensorflow/lite/schema/schema.fbs
diff --git a/runtime/contrib/android/api/build.gradle b/runtime/contrib/android/api/build.gradle
index 2e3955c3a..b432929b5 100644
--- a/runtime/contrib/android/api/build.gradle
+++ b/runtime/contrib/android/api/build.gradle
@@ -8,7 +8,7 @@ android {
         minSdkVersion 26
         targetSdkVersion 29
         versionCode 1
-        versionName "1.17.0"
+        versionName "1.18.0"
 
         externalNativeBuild {
             ndkBuild {
diff --git a/runtime/libs/ndarray/CMakeLists.txt b/runtime/libs/ndarray/CMakeLists.txt
new file mode 100644
index 000000000..f88f13186
--- /dev/null
+++ b/runtime/libs/ndarray/CMakeLists.txt
@@ -0,0 +1,23 @@
+add_library(ndarray STATIC src/Array.cpp src/ContiguousSpan.cpp)
+
+set_target_properties(ndarray PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+target_include_directories(ndarray PUBLIC include)
+#can't make this private because of c++ templates
+target_include_directories(ndarray PUBLIC src)
+
+option(NDARRAY_INLINE_TEMPLATES "Set to ON to disable extern declarations for common types")
+
+if(${NDARRAY_INLINE_TEMPLATES})
+    target_compile_definitions(ndarray PUBLIC -DNDARRAY_INLINE_TEMPLATES=1)
+endif()
+
+target_link_libraries(ndarray PRIVATE nnfw_common)
+target_link_libraries(ndarray PRIVATE nnfw_coverage)
+
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
+add_subdirectory(test)
+add_subdirectory(example)
diff --git a/runtime/libs/ndarray/example/CMakeLists.txt b/runtime/libs/ndarray/example/CMakeLists.txt
new file mode 100644
index 000000000..c4b575dad
--- /dev/null
+++ b/runtime/libs/ndarray/example/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_executable(example_no_array example_no_array.cpp)
+
+add_executable(example_array example_array.cpp)
+target_link_libraries(example_array PRIVATE ndarray)
diff --git a/runtime/libs/ndarray/example/example_array.cpp b/runtime/libs/ndarray/example/example_array.cpp
new file mode 100644
index 000000000..85d274681
--- /dev/null
+++ b/runtime/libs/ndarray/example/example_array.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ndarray/Array.h"
+
+#include <iostream>
+#include <iterator>
+
+using namespace ndarray;
+
+void gather_array(const Array<float> &input, Array<float> &output, const Array<int> &indices)
+{
+  assert(indices.shape().rank() == 3);
+  assert(input.shape().rank() == 3);
+  assert(indices.shape().dim(1) == input.shape().rank());
+
+  for (size_t i = 0; i < indices.shape().dim(0); ++i)
+  {
+    for (size_t j = 0; j < indices.shape().dim(1); ++j)
+    {
+      auto index = indices.slice(i, j);
+      output.slice(i, j).assign(input.slice(index[0], index[1]));
+    }
+  }
+}
+
+int main()
+{
+  // fill tensor of shape[3,3,4] with sequential numbers from [0..36)
+  Shape in_shape{3, 3, 4};
+  std::vector<float> input_data(in_shape.element_count());
+  for (size_t i = 0; i < in_shape.element_count(); ++i)
+    input_data[i] = i;
+
+  Array<float> input(input_data.data(), in_shape);
+
+  // select column-vectors on main diagonal
+  Shape indices_shape{1, 3, 2};
+  std::vector<int> indices_data(indices_shape.element_count());
+  Array<int> indices(indices_data.data(), indices_shape);
+
+  indices.slice(0, 0) = {0, 0};
+  indices.slice(0, 1) = {1, 1};
+  indices.slice(0, 2) = {2, 2};
+
+  Shape output_shape{1, 3, 4};
+  std::vector<float> output_data(output_shape.element_count());
+
+  Array<float> output(output_data.data(), output_shape);
+
+  gather_array(input, output, indices);
+
+  for (size_t i = 0; i < indices_shape.dim(0); ++i)
+  {
+    for (size_t j = 0; j < indices_shape.dim(1); ++j)
+    {
+      auto output_piece = output.slice(i, j);
+      std::ostream_iterator<int> cout_it(std::cout, ", ");
+      std::copy(output_piece.begin(), output_piece.end(), cout_it);
+      std::cout << std::endl;
+    }
+  }
+}
diff --git a/runtime/libs/ndarray/example/example_no_array.cpp b/runtime/libs/ndarray/example/example_no_array.cpp
new file mode 100644
index 000000000..3a4d05dca
--- /dev/null
+++ b/runtime/libs/ndarray/example/example_no_array.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <array>
+#include <vector>
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+
+void gather_no_array(const float *in_data, const std::array<size_t, 3> &dims, float *out_data,
+                     const std::array<size_t, 3> &out_dims, //[nselections,
+                     const int *indices, const std::array<size_t, 3> &indices_dims)
+{
+  assert(indices_dims[1] == dims.size());
+
+  for (int i = 0; i < indices_dims[0]; ++i)
+  {
+    for (int j = 0; j < indices_dims[1]; ++j)
+    {
+      const int *index_ptr = indices + i * indices_dims[2] * indices_dims[1] + j * indices_dims[2];
+
+      size_t in_offset = index_ptr[0] * dims[2] * dims[1] + index_ptr[1] * dims[2];
+
+      const float *in_ptr = in_data + in_offset;
+
+      size_t out_offset = i * out_dims[2] * out_dims[1] + j * out_dims[2];
+
+      float *out_ptr = out_data + out_offset;
+
+      for (int k = 0; k < dims[2]; ++k)
+      {
+        out_ptr[k] = in_ptr[k];
+      }
+    }
+  }
+}
+
+int main()
+{
+  std::array<size_t, 3> in_dims{3, 3, 4};
+  std::vector<float> input(3 * 3 * 4);
+  for (size_t i = 0; i < 3 * 3 * 4; ++i)
+    input[i] = i;
+
+  std::array<size_t, 3> indices_shape{1, 3, 2};
+  std::vector<int> indices(1 * 3 * 2);
+
+  indices[0] = 0;
+  indices[1] = 0;
+  indices[2] = 1;
+  indices[3] = 1;
+  indices[4] = 2;
+  indices[5] = 2;
+
+  std::array<size_t, 3> output_dims{1, 3, 4};
+  std::vector<float> output(1 * 3 * 4);
+
+  gather_no_array(input.data(), in_dims, output.data(), output_dims, indices.data(), indices_shape);
+
+  for (size_t i = 0; i < output_dims[0]; ++i)
+  {
+    for (size_t j = 0; j < output_dims[1]; ++j)
+    {
+      auto out_ptr = output.data() + i * output_dims[1] * output_dims[2] + j * output_dims[2];
+      for (size_t k = 0; k < output_dims[2]; ++k)
+      {
+        std::cout << out_ptr[k] << ", ";
+      }
+      std::cout << std::endl;
+    }
+  }
+}
diff --git a/runtime/libs/ndarray/include/ndarray/Array.h b/runtime/libs/ndarray/include/ndarray/Array.h
new file mode 100644
index 000000000..09e791763
--- /dev/null
+++ b/runtime/libs/ndarray/include/ndarray/Array.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _NDARRAY_ARRAY_H_
+#define _NDARRAY_ARRAY_H_
+
+#include "Common.h"
+
+#include "ContiguousSpan.h"
+#include "Shape.h"
+
+#if __cplusplus < 201402L
+#include "detail/cxx14.h" //integer_sequence and make_index_dequence definitions
+#else
+#include <utility>
+#endif
+
+#include <algorithm>
+#include <cassert>
+#include <type_traits>
+#include <array>
+#include <tuple>
+#include <cstddef>
+
+namespace ndarray
+{
+
+// there is no index_sequence before c++14
+#if __cplusplus < 201402L
+
+template <size_t... Nums> using index_sequence = cxx14::index_sequence<Nums...>;
+
+template <size_t Num> using make_index_sequence = cxx14::make_index_sequence<Num>;
+
+#else
+
+template <size_t... Nums> using index_sequence = std::index_sequence<Nums...>;
+
+template <size_t _Num> using make_index_sequence = std::make_index_sequence<_Num>;
+
+#endif //__cplusplus < 201402L
+
+struct Strides
+{
+  explicit Strides(Shape s) : _strides{} { fillStrides(s); }
+
+  int operator[](size_t idx) const noexcept { return _strides[idx]; }
+
+  // since we don't have c++14 fold expression
+  template <typename Seq, typename... Ts> struct _calc_offset;
+
+  template <size_t Num, size_t... Nums, typename T, typename... Ts>
+  struct _calc_offset<index_sequence<Num, Nums...>, T, Ts...>
+  {
+    static constexpr size_t get(const std::array<int, 8> &strides, int x, Ts... xs)
+    {
+      return _calc_offset<index_sequence<Nums...>, Ts...>::get(strides, xs...) +
+             x * std::get<Num>(strides);
+    }
+  };
+
+  template <size_t Num, typename T> struct _calc_offset<index_sequence<Num>, T>
+  {
+    static constexpr size_t get(const std::array<int, 8> &strides, int x)
+    {
+      return x * std::get<Num>(strides);
+    }
+  };
+
+  template <typename Seq, typename... Ts> constexpr size_t offset(Seq, Ts... x) const noexcept
+  {
+    // return ( 0 + ... + (std::get<Nums>(_strides) * x)); in c++14
+    return _calc_offset<Seq, Ts...>::get(_strides, x...);
+  }
+
+private:
+  void fillStrides(const Shape &s) noexcept
+  {
+    int rank = s.rank();
+    _strides[rank - 1] = 1;
+    for (int d = rank - 2; d >= 0; --d)
+    {
+      _strides[d] = _strides[d + 1] * s.dim(d + 1);
+    }
+  }
+
+  std::array<int, NDARRAY_MAX_DIMENSION_COUNT> _strides;
+};
+
+template <typename T> class Array
+{
+public:
+  Array(T *data, Shape shape) noexcept : _data(data), _shape(shape), _strides(shape) {}
+
+  Array(const Array &) = delete;
+
+  Array(Array &&a) noexcept : _data(a._data), _shape(a._shape), _strides(a._strides)
+  {
+    a._data = nullptr;
+  }
+
+  template <typename... Ts> T &at(Ts... x) const noexcept { return _at(static_cast<size_t>(x)...); }
+
+  /**
+   * @brief returns last dimension as ContigniousSpan
+   * @param x indices of slice to take. See tests for usage details
+   * @return slice at given position
+   */
+  template <typename... Ts> ContiguousSpan<T, std::is_const<T>::value> slice(Ts... x) noexcept
+  {
+    assert(sizeof...(Ts) == _shape.rank() - 1);
+    return {&at(x..., 0ul), _shape.dim(_shape.rank() - 1)};
+  }
+
+  /**
+   * @brief returns last dimension as ContigniousSpan
+   * @param x indices of slice to take. See tests for usage details
+   * @return slice at given position
+   */
+  template <typename... Ts> ContiguousSpan<T, true> slice(Ts... x) const noexcept
+  {
+    assert(sizeof...(Ts) == _shape.rank() - 1);
+    return {&at(x..., 0ul), _shape.dim(_shape.rank() - 1)};
+  }
+
+  ContiguousSpan<T, std::is_const<T>::value> flat() noexcept
+  {
+    return {_data, _shape.element_count()};
+  }
+
+  ContiguousSpan<T, true> flat() const noexcept { return {_data, _shape.element_count()}; }
+
+  const Shape &shape() const noexcept { return _shape; }
+
+private:
+  template <typename... Ts> T &_at(Ts... x) const noexcept
+  {
+    assert(sizeof...(x) == _shape.rank());
+    using Indices = make_index_sequence<sizeof...(Ts)>;
+    return _data[offset(Indices{}, x...)];
+  }
+
+  template <typename... Ts, size_t... Nums>
+  size_t offset(index_sequence<Nums...> seq, Ts... x) const noexcept
+  {
+    static_assert(
+      sizeof...(Ts) == sizeof...(Nums),
+      "Sanity check failed. Generated index sequence size is not equal to argument count");
+
+    return _strides.offset(seq, x...);
+  }
+
+  T *_data;
+  Shape _shape;
+  Strides _strides;
+};
+
+template <typename To, typename From> Array<To> array_cast(Array<From> &&from, Shape newShape)
+{
+  assert(from.shape().element_count() / (sizeof(To) / sizeof(From)) == newShape.element_count());
+  return Array<To>(reinterpret_cast<To *>(from.flat().data()), newShape);
+}
+
+template <typename To, typename From>
+Array<const To> array_cast(const Array<From> &from, Shape newShape)
+{
+  assert(from.shape().element_count() / (sizeof(To) / sizeof(From)) == newShape.element_count());
+  return Array<To>(reinterpret_cast<const To *>(from.flat().data()), newShape);
+}
+
+#ifndef NDARRAY_INLINE_TEMPLATES
+
+extern template class Array<float>;
+extern template class Array<int32_t>;
+extern template class Array<uint32_t>;
+extern template class Array<uint8_t>;
+
+#endif // NDARRAY_INLINE_TEMPLATES
+
+} // namespace ndarray
+
+#endif //_NDARRAY_ARRAY_H_
diff --git a/runtime/libs/ndarray/include/ndarray/Common.h b/runtime/libs/ndarray/include/ndarray/Common.h
new file mode 100644
index 000000000..aa0cc6fe2
--- /dev/null
+++ b/runtime/libs/ndarray/include/ndarray/Common.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _NDARRAY_COMMON_H_
+#define _NDARRAY_COMMON_H_
+
+#define NDARRAY_MAX_DIMENSION_COUNT 8
+
+#endif //_NDARRAY_COMMON_H_
diff --git a/runtime/libs/ndarray/include/ndarray/ContiguousSpan.h b/runtime/libs/ndarray/include/ndarray/ContiguousSpan.h
new file mode 100644
index 000000000..b322b77db
--- /dev/null
+++ b/runtime/libs/ndarray/include/ndarray/ContiguousSpan.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _NDARRAY_CONTIGNIOUS_SPAN_H_
+#define _NDARRAY_CONTIGNIOUS_SPAN_H_
+
+#include <type_traits>
+#include <vector>
+#include <cstdint>
+#include <cstddef>
+#include <cassert>
+
+namespace ndarray
+{
+
+template <typename T, bool isConst = false> class ContiguousSpan
+{
+public:
+  using pointer_type = typename std::conditional<isConst, const T *, T *>::type;
+  using reference_type = typename std::conditional<isConst, const T &, T &>::type;
+  using iterator_type = pointer_type;
+
+  ContiguousSpan(pointer_type data, size_t len) noexcept : _data(data), _len(len) {}
+
+  template <typename It>
+  explicit ContiguousSpan(It first, It last) noexcept
+    : _data(&*first), _len(std::distance(first, last))
+  {
+  }
+
+  ContiguousSpan(const ContiguousSpan &) = delete;
+
+  ContiguousSpan(ContiguousSpan &&s) noexcept : _data(s._data), _len(s._len) { s._data = nullptr; }
+
+  operator ContiguousSpan<T, true>() { return ContiguousSpan<T, true>{_data, _len}; }
+
+  reference_type operator[](size_t idx) const noexcept { return _data[idx]; }
+
+  reference_type at(size_t idx) const noexcept { return _data[idx]; }
+
+  ContiguousSpan<T, isConst> offset(size_t offset)
+  {
+    assert(offset <= _len);
+    return {_data + offset, _len - offset};
+  }
+
+  template <typename From, bool _ = isConst>
+  typename std::enable_if<!_, void>::type assign(const From &f) noexcept
+  {
+    assignFrom(std::begin(f), std::end(f));
+  }
+
+  template <typename U, bool _ = isConst>
+  typename std::enable_if<!_, ContiguousSpan &>::type
+  operator=(std::initializer_list<U> list) noexcept
+  {
+    assignFrom(std::begin(list), std::end(list));
+    return *this;
+  }
+
+  template <typename It, bool _ = isConst>
+  typename std::enable_if<!_, void>::type assignFrom(It first, It last) noexcept
+  {
+    std::copy(first, last, begin());
+  }
+
+  size_t size() const { return _len; }
+
+  iterator_type begin() const { return iterator_type{_data}; }
+
+  iterator_type end() const { return iterator_type{_data + _len}; }
+
+  pointer_type data() { return _data; }
+
+private:
+  pointer_type _data;
+  size_t _len;
+};
+
+#ifndef NDARRAY_INLINE_TEMPLATES
+
+extern template class ContiguousSpan<float, true>;
+extern template class ContiguousSpan<float, false>;
+extern template class ContiguousSpan<int32_t, true>;
+extern template class ContiguousSpan<int32_t, false>;
+extern template class ContiguousSpan<uint32_t, true>;
+extern template class ContiguousSpan<uint32_t, false>;
+extern template class ContiguousSpan<uint8_t, true>;
+extern template class ContiguousSpan<uint8_t, false>;
+
+#endif // NDARRAY_INLINE_TEMPLATES
+
+} // namespace ndarray
+
+#endif //_NDARRAY_CONTIGNIOUS_SPAN_H_
diff --git a/runtime/libs/ndarray/include/ndarray/Shape.h b/runtime/libs/ndarray/include/ndarray/Shape.h
new file mode 100644
index 000000000..fa58613b8
--- /dev/null
+++ b/runtime/libs/ndarray/include/ndarray/Shape.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _NDARRAY_SHAPE_H_
+#define _NDARRAY_SHAPE_H_
+
+#include "Common.h"
+
+#include <array>
+#include <cassert>
+#include <cstddef>
+
+namespace ndarray
+{
+
+class Shape
+{
+public:
+  //_dims{} here and later since array does not have std::initializer_list ctor
+  // and aggregate initialization is not allowed here
+  explicit Shape(size_t rank) noexcept : _dims{}, _rank(rank)
+  {
+    std::fill(_dims.begin(), _dims.end(), 0);
+  }
+
+  Shape(std::initializer_list<size_t> list) noexcept : _dims{}, _rank(list.size())
+  {
+    std::copy(list.begin(), list.end(), _dims.begin());
+  }
+
+  size_t dim(int i) const noexcept { return _dims.at(i); }
+
+  size_t &dim(int i) noexcept { return _dims.at(i); }
+
+  size_t element_count() const noexcept
+  {
+    uint32_t res = 1;
+    for (size_t i = 0; i < rank(); ++i)
+      res *= dim(i);
+    assert(res <= 0xffffffff);
+    return res;
+  }
+
+  size_t rank() const noexcept { return _rank; }
+
+private:
+  std::array<size_t, NDARRAY_MAX_DIMENSION_COUNT> _dims;
+  size_t _rank;
+};
+
+} // namespace ndarray
+
+#endif //_NDARRAY_SHAPE_H_
diff --git a/runtime/libs/ndarray/src/Array.cpp b/runtime/libs/ndarray/src/Array.cpp
new file mode 100644
index 000000000..f9c9de9d3
--- /dev/null
+++ b/runtime/libs/ndarray/src/Array.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ndarray/Array.h"
+
+namespace ndarray
+{
+
+template class Array<float>;
+template class Array<int32_t>;
+template class Array<uint32_t>;
+template class Array<uint8_t>;
+
+} // namespace ndarray
diff --git a/runtime/libs/ndarray/src/ContiguousSpan.cpp b/runtime/libs/ndarray/src/ContiguousSpan.cpp
new file mode 100644
index 000000000..e06cfc2a1
--- /dev/null
+++ b/runtime/libs/ndarray/src/ContiguousSpan.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ndarray/ContiguousSpan.h"
+
+namespace ndarray
+{
+
+template class ContiguousSpan<float, true>;
+template class ContiguousSpan<float, false>;
+template class ContiguousSpan<int32_t, true>;
+template class ContiguousSpan<int32_t, false>;
+template class ContiguousSpan<uint32_t, true>;
+template class ContiguousSpan<uint32_t, false>;
+template class ContiguousSpan<uint8_t, true>;
+template class ContiguousSpan<uint8_t, false>;
+
+} // namespace ndarray
diff --git a/runtime/libs/ndarray/src/detail/cxx14.h b/runtime/libs/ndarray/src/detail/cxx14.h
new file mode 100644
index 000000000..8b78fb985
--- /dev/null
+++ b/runtime/libs/ndarray/src/detail/cxx14.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _NDARRAY_CXX14_H_
+#define _NDARRAY_CXX14_H_
+
+namespace ndarray
+{
+
+namespace cxx14
+{
+
+template <size_t... Nums> struct index_sequence
+{
+  using value_type = size_t;
+
+  static constexpr std::size_t size() noexcept { return sizeof...(Nums); }
+};
+
+namespace detail
+{
+
+template <size_t v, typename Seq> struct _append;
+
+template <size_t v, size_t... Nums> struct _append<v, index_sequence<Nums...>>
+{
+  using result = index_sequence<Nums..., v>;
+};
+
+template <size_t Len> struct make_index_sequence
+{
+  using result =
+    typename detail::_append<Len - 1, typename make_index_sequence<Len - 1>::result>::result;
+};
+
+template <> struct make_index_sequence<1>
+{
+  using result = index_sequence<0>;
+};
+
+template <> struct make_index_sequence<0>
+{
+  using result = index_sequence<>;
+};
+
+} // namespace detail
+
+template <size_t Num> using make_index_sequence = typename detail::make_index_sequence<Num>::result;
+
+} // namespace cxx14
+
+} // namespace ndarray
+
+#endif //_NDARRAY_CXX14_H_
diff --git a/runtime/libs/ndarray/test/CMakeLists.txt b/runtime/libs/ndarray/test/CMakeLists.txt
new file mode 100644
index 000000000..be1ed6510
--- /dev/null
+++ b/runtime/libs/ndarray/test/CMakeLists.txt
@@ -0,0 +1,18 @@
+if(NOT TARGET ndarray)
+    return()
+endif()
+
+add_executable(ndarray_test ndarray_test.cpp)
+
+target_link_libraries(ndarray_test PRIVATE ndarray)
+
+nnfw_find_package(GTest)
+if(NOT GTest_FOUND)
+    message(STATUS "GTest not avaialble. Skipping NDArray test build")
+    return()
+endif(NOT GTest_FOUND)
+
+target_link_libraries(ndarray_test PUBLIC gtest gtest_main ${LIB_PTHREAD})
+
+add_test(ndarray_test ndarray_test)
+install(TARGETS ndarray_test DESTINATION unittest_standalone)
diff --git a/runtime/libs/ndarray/test/ndarray_test.cpp b/runtime/libs/ndarray/test/ndarray_test.cpp
new file mode 100644
index 000000000..4b5ad5765
--- /dev/null
+++ b/runtime/libs/ndarray/test/ndarray_test.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gtest/gtest.h"
+
+#include "ndarray/Array.h"
+
+using namespace ndarray;
+
+TEST(NDArray_tests, basic_data_test)
+{
+
+  float raw_data[] = {1, 2, 3, 4};
+
+  Array<float> data22{raw_data, {2, 2}};
+
+  ASSERT_FLOAT_EQ(data22.at(0, 0), 1);
+  ASSERT_FLOAT_EQ(data22.at(0, 1), 2);
+  ASSERT_FLOAT_EQ(data22.at(1, 0), 3);
+  ASSERT_FLOAT_EQ(data22.at(1, 1), 4);
+  ASSERT_EQ(data22.shape().rank(), 2);
+  ASSERT_EQ(data22.shape().dim(0), 2);
+  ASSERT_EQ(data22.shape().dim(1), 2);
+
+  Array<float> data14{raw_data, {1, 4}};
+  ASSERT_FLOAT_EQ(data14.at(0, 0), 1);
+  ASSERT_FLOAT_EQ(data14.at(0, 1), 2);
+  ASSERT_FLOAT_EQ(data14.at(0, 2), 3);
+  ASSERT_FLOAT_EQ(data14.at(0, 3), 4);
+  ASSERT_EQ(data14.shape().rank(), 2);
+  ASSERT_EQ(data14.shape().dim(0), 1);
+  ASSERT_EQ(data14.shape().dim(1), 4);
+
+  ContiguousSpan<float> cs = data22.flat();
+  ASSERT_EQ(cs.size(), 4);
+  ASSERT_FLOAT_EQ(cs.at(3), 4);
+
+  Array<float> lv = std::move(data14);
+  ASSERT_FLOAT_EQ(lv.at(0, 0), 1);
+  ASSERT_FLOAT_EQ(lv.at(0, 1), 2);
+  ASSERT_FLOAT_EQ(lv.at(0, 2), 3);
+  ASSERT_FLOAT_EQ(lv.at(0, 3), 4);
+}
+
+TEST(NDArray_tests, slice_write_test)
+{
+  float raw_data[4] = {0};
+
+  Array<float> data22{raw_data, {2, 2}};
+
+  data22.slice(1) = {1, 2};
+
+  ASSERT_FLOAT_EQ(data22.at(0, 0), 0);
+  ASSERT_FLOAT_EQ(data22.at(0, 1), 0);
+  ASSERT_FLOAT_EQ(data22.at(1, 0), 1);
+  ASSERT_FLOAT_EQ(data22.at(1, 1), 2);
+}
+
+TEST(NDArray_tests, slice_read_test)
+{
+  float raw_data[4] = {1, 2, 3, 4};
+
+  Array<float> data22{raw_data, {2, 2}};
+
+  auto slice = data22.slice(1);
+
+  ASSERT_FLOAT_EQ(slice[0], 3);
+  ASSERT_FLOAT_EQ(slice[1], 4);
+}
+
+TEST(NDArray_tests, multidim_test)
+{
+  float raw_data[5] = {0, 1, 2, 3, 4};
+
+  Array<float> data22{raw_data, {1, 1, 1, 1, 5}};
+
+  ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 0), 0);
+  ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 1), 1);
+  ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 2), 2);
+  ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 3), 3);
+  ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 4), 4);
+}
+
+TEST(NDArray_tests, slice_assign_test)
+{
+  std::vector<float> v1{1, 2, 3, 4, 5};
+  std::vector<float> v2(5);
+
+  ContiguousSpan<float> span1(v1.begin(), v1.end());
+  ContiguousSpan<float> span2(v2.begin(), v2.end());
+
+  span2.assign(span1);
+
+  ASSERT_EQ(v1, v2);
+  ASSERT_EQ(span1.size(), 5);
+  ASSERT_EQ(span2.size(), 5);
+
+  ASSERT_EQ(span2.at(2), 3);
+  ASSERT_EQ(span2.at(4), 5);
+
+  ASSERT_EQ(*(span1.data() + 2), *(span1.data() + 2));
+
+  ContiguousSpan<float> span3(span2.offset(1));
+  ASSERT_EQ(span3.size(), 4);
+  ASSERT_EQ(span3.at(0), 2);
+  ASSERT_EQ(span3.at(1), 3);
+  ASSERT_EQ(span3.at(2), 4);
+  ASSERT_EQ(span3.at(3), 5);
+}
diff --git a/runtime/onert/api/include/nnfw_version.h b/runtime/onert/api/include/nnfw_version.h
index 6624ae676..4fce291a0 100644
--- a/runtime/onert/api/include/nnfw_version.h
+++ b/runtime/onert/api/include/nnfw_version.h
@@ -21,6 +21,6 @@
  * NNFW_VERSION is a uint32 value representing nnfw runtime version
  * in 0xMMmmmmPP, where MM = major, mmmm = minor, PP = patch
  */
-#define NNFW_VERSION 0x01001100
+#define NNFW_VERSION 0x01001200
 
 #endif // __NNFW_VERSION_H__
diff --git a/runtime/onert/backend/cpu/CMakeLists.txt b/runtime/onert/backend/cpu/CMakeLists.txt
index 1e5443263..b61e58251 100644
--- a/runtime/onert/backend/cpu/CMakeLists.txt
+++ b/runtime/onert/backend/cpu/CMakeLists.txt
@@ -12,6 +12,7 @@ target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_common)
 target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_coverage)
 target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ruy)
 target_link_libraries(${LIB_ONERT_BACKEND_CPU} INTERFACE ruy_instrumentation)
+target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ndarray)
 
 set_target_properties(${LIB_ONERT_BACKEND_CPU} PROPERTIES OUTPUT_NAME backend_cpu)
 
diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc
index 59fb68d55..75274dc88 100644
--- a/runtime/onert/backend/cpu/KernelGenerator.cc
+++ b/runtime/onert/backend/cpu/KernelGenerator.cc
@@ -35,6 +35,7 @@
 #include "ops/GatherLayer.h"
 #include "ops/LSTMLayer.h"
 #include "ops/MeanLayer.h"
+#include "ops/DetectionPostProcessLayer.h"
 #include "ops/OneHotLayer.h"
 #include "ops/OperationUtils.h"
 #include "ops/PackLayer.h"
@@ -1177,6 +1178,51 @@ void KernelGenerator::visit(const ir::operation::MatrixBandPart &node)
   _return_fn = std::move(fn);
 }
 
+void KernelGenerator::visit(const ir::operation::DetectionPostProcess &node)
+{
+  using NMS = ir::operation::DetectionPostProcess;
+
+  ops::DetectionPostProcessLayer::DetectionPostProcessParameters parameters;
+  parameters.scales.y = node.param().scale.y_scale;
+  parameters.scales.x = node.param().scale.x_scale;
+  parameters.scales.w = node.param().scale.w_scale;
+  parameters.scales.h = node.param().scale.h_scale;
+
+  parameters.iou_threshold = node.param().iou_threshold;
+  parameters.score_threshold = node.param().score_threshold;
+  parameters.max_boxes_per_class = node.param().max_boxes_per_class;
+  parameters.max_detections = node.param().max_detections;
+  parameters.num_classes = node.param().num_classes;
+  parameters.center_box_format = node.param().center_size_boxes;
+  parameters.max_classes_per_detection = node.param().max_classes_per_detection;
+
+  auto boxes_index = node.getInputs().at(NMS::Input::BOXES);
+  auto scores_index = node.getInputs().at(NMS::Input::SCORES);
+  auto anchors_index = node.getInputs().at(NMS::Input::INPUT_ANCHORS);
+
+  auto o_classes_index = node.getOutputs().at(NMS::Output::BOX_CLASSES);
+  auto o_coords_index = node.getOutputs().at(NMS::Output::BOX_COORDS);
+  auto o_scores_index = node.getOutputs().at(NMS::Output::BOX_SCORES);
+  auto o_num_selected_index = node.getOutputs().at(NMS::Output::NUM_SELECTED);
+
+  parameters.boxes_descr = _ctx.at(boxes_index).shape().dims();
+  parameters.scrores_descr = _ctx.at(scores_index).shape().dims();
+
+  parameters.boxes_input = _tensor_reg->getPortableTensor(boxes_index);
+  parameters.scores_input = _tensor_reg->getPortableTensor(scores_index);
+  parameters.anchors_input = _tensor_reg->getPortableTensor(anchors_index);
+
+  parameters.box_classes_output = _tensor_reg->getPortableTensor(o_classes_index);
+  parameters.box_coords_output = _tensor_reg->getPortableTensor(o_coords_index);
+  parameters.box_scores_output = _tensor_reg->getPortableTensor(o_scores_index);
+  parameters.num_selections_output = _tensor_reg->getPortableTensor(o_num_selected_index);
+
+  auto fn = std::make_unique<ops::DetectionPostProcessLayer>();
+  fn->configure(std::move(parameters));
+
+  _return_fn = std::move(fn);
+}
+
 void KernelGenerator::visit(const ir::operation::BatchMatMul &node)
 {
   const auto output_index{node.getOutputs().at(0)};
diff --git a/runtime/onert/backend/cpu/KernelGenerator.h b/runtime/onert/backend/cpu/KernelGenerator.h
index d452d0ba6..d7d5fe6fc 100644
--- a/runtime/onert/backend/cpu/KernelGenerator.h
+++ b/runtime/onert/backend/cpu/KernelGenerator.h
@@ -69,6 +69,7 @@ public:
   void visit(const ir::operation::LogSoftmax &) override;
   void visit(const ir::operation::LSTM &) override;
   void visit(const ir::operation::MatrixBandPart &) override;
+  void visit(const ir::operation::DetectionPostProcess &) override;
   void visit(const ir::operation::OneHot &) override;
   void visit(const ir::operation::Pack &) override;
   void visit(const ir::operation::Pad &) override;
diff --git a/runtime/onert/backend/cpu/ops/DetectionPostProcessLayer.cc b/runtime/onert/backend/cpu/ops/DetectionPostProcessLayer.cc
new file mode 100644
index 000000000..8a6fe6504
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/DetectionPostProcessLayer.cc
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DetectionPostProcessLayer.h"
+
+#include "ndarray/Array.h"
+
+#include <numeric>
+#include <utility>
+#include <cmath>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+namespace
+{
+
+using namespace ndarray;
+
+using CenterSizeBox = DetectionPostProcessLayer::CenterSizeBox;
+using CornerBox = DetectionPostProcessLayer::CornerBox;
+
+using NonMaxSuppressionParam = DetectionPostProcessLayer::DetectionPostProcessParameters;
+using Allocations = DetectionPostProcessLayer::Allocations;
+
+struct OutputArrays
+{
+  OutputArrays(CornerBox *coords_buf, float *scores_buf, float *classes_buf,
+               int *num_selections_buf, size_t max_detections)
+    : coords(coords_buf, {max_detections}), scores(scores_buf, {max_detections}),
+      classes(classes_buf, {max_detections}), num_selections(num_selections_buf, {1})
+  {
+  }
+
+  Array<CornerBox> coords;
+  Array<float> scores;
+  Array<float> classes;
+  Array<int> num_selections;
+};
+
+struct TemporaryArrays
+{
+  TemporaryArrays(int *selections_buffer, int max_detections)
+    : selections(selections_buffer, {static_cast<unsigned long>(max_detections)})
+  {
+  }
+
+  Array<int> selections;
+};
+
+// sort indices in decreasing order of first `k` scores
+void PartialArgSort(const ContiguousSpan<float, true> &scores,
+                    const ContiguousSpan<int, false> &indices, int k)
+{
+  std::iota(indices.begin(), indices.begin() + k, 0);
+  std::partial_sort(indices.begin(), indices.begin() + k, indices.begin() + scores.size(),
+                    [&scores](const int i, const int j) { return scores[i] > scores[j]; });
+}
+
+template <typename T> ContiguousSpan<T, false> static vecToSpan(std::vector<T> &v)
+{
+  return ContiguousSpan<T, false>{v.begin(), v.end()};
+}
+
+Array<const CornerBox> decodeBoxes(const Array<float> &raw_boxes, const Array<float> &raw_anchors,
+                                   bool center_box_format, const CenterSizeBox &scales)
+{
+  auto nbatches = raw_boxes.shape().dim(0);
+  auto num_boxes = raw_boxes.shape().dim(1);
+
+  auto anchors = array_cast<const CenterSizeBox>(raw_anchors, {num_boxes});
+
+  if (!center_box_format)
+  {
+    auto boxes_p = reinterpret_cast<const CornerBox *>(raw_boxes.flat().data());
+    return {boxes_p, {num_boxes}};
+  }
+  else
+  {
+    // TODO support box center-width encoding correctly
+    // i.e anchors
+    auto boxes_p = reinterpret_cast<const CenterSizeBox *>(raw_boxes.flat().data());
+    Array<const CenterSizeBox> in_boxes{boxes_p, {num_boxes}};
+
+    auto decoded_boxes_p = new CornerBox[nbatches * num_boxes];
+    Array<CornerBox> decoded_boxes_a{decoded_boxes_p, {num_boxes}};
+
+    for (size_t i = 0; i < num_boxes; ++i)
+    {
+      auto anchor = anchors.at(i);
+      auto &box = decoded_boxes_a.at(i);
+      float yc = in_boxes.at(i).y / scales.y * anchor.h + anchor.y;
+      float xc = in_boxes.at(i).x / scales.x * anchor.w + anchor.x;
+      float halfh = 0.5f * std::exp(in_boxes.at(i).h / scales.h) * anchor.h;
+      float halfw = 0.5f * std::exp(in_boxes.at(i).w / scales.w) * anchor.w;
+      box.x1 = xc - halfw;
+      box.x2 = xc + halfw;
+      box.y1 = yc - halfh;
+      box.y2 = yc + halfh;
+
+      assert(box.x2 > box.x1);
+      assert(box.y2 > box.y1);
+    }
+
+    return array_cast<const CornerBox>(std::move(decoded_boxes_a), decoded_boxes_a.shape());
+  }
+}
+
+float computeIOU(const CornerBox &box1, const CornerBox &box2)
+{
+  float area_i = (box1.y2 - box1.y1) * (box1.x2 - box1.x1);
+  float area_j = (box2.y2 - box2.y1) * (box2.x2 - box2.x1);
+  if (area_i <= 0 || area_j <= 0)
+  {
+    return 0.0;
+  }
+  float in_ymin = std::max<float>(box1.y1, box2.y1);
+  float in_xmin = std::max<float>(box1.x1, box2.x1);
+  float in_ymax = std::min<float>(box1.y2, box2.y2);
+  float in_xmax = std::min<float>(box1.x2, box2.x2);
+  float in_area = std::max<float>(in_ymax - in_ymin, 0.0) * std::max<float>(in_xmax - in_xmin, 0.0);
+
+  return in_area / (area_i + area_j - in_area);
+}
+
+int doSingleClass(const Array<const CornerBox> &boxes, const std::vector<float> &scores,
+                  const NonMaxSuppressionParam &param, TemporaryArrays &temps,
+                  size_t max_detections)
+{
+  auto num_boxes = boxes.shape().dim(0);
+
+  std::vector<int> sorted_box_indices(num_boxes);
+  PartialArgSort(ContiguousSpan<float, true>(scores.data(), num_boxes),
+                 vecToSpan(sorted_box_indices), num_boxes);
+
+  // TODO move to temp allocations
+  std::vector<int> process_box(num_boxes, 1);
+
+  size_t selected_count = 0;
+  for (size_t i = 0; i < num_boxes; ++i)
+  {
+    auto box_index = sorted_box_indices[i];
+
+    if (!process_box[box_index] || scores[box_index] < param.score_threshold)
+    {
+      continue;
+    }
+
+    temps.selections.at(selected_count) = box_index;
+    selected_count++;
+
+    if (selected_count >= max_detections)
+    {
+      break;
+    }
+
+    for (size_t j = i + 1; j < num_boxes; ++j)
+    {
+      if (!process_box[sorted_box_indices[j]])
+      {
+        continue;
+      }
+
+      float IOU = computeIOU(boxes.at(box_index), boxes.at(sorted_box_indices[j]));
+      if (IOU > param.iou_threshold)
+      {
+        process_box[sorted_box_indices[j]] = 0;
+      }
+    }
+  }
+
+  return selected_count;
+}
+
+void collectBoxes(TemporaryArrays &temporary, const Array<const CornerBox> &decoded_boxes,
+                  std::vector<float> &scores, int num_selected, OutputArrays &output,
+                  const Array<int> &sorted_classes, int detections_per_box)
+{
+  auto &selections = temporary.selections;
+
+  size_t output_box_count = 0;
+
+  for (int i = 0; i < num_selected; ++i)
+  {
+    int selected_box = selections.at(output_box_count);
+
+    for (int c = 0; c < detections_per_box; ++c)
+    {
+      output.classes.at(output_box_count) = sorted_classes.at(selected_box, c);
+      output.scores.at(output_box_count) = scores[selected_box];
+      output.coords.at(output_box_count) = decoded_boxes.at(selected_box);
+      output_box_count++;
+    }
+  }
+}
+
+void DetectionPostProcess(const Array<float> &boxes_a, const Array<float> &scores_a,
+                          Array<float> &num_selected_a, const NonMaxSuppressionParam &param,
+                          const Allocations &allocations, OutputArrays &outputs)
+{
+  TemporaryArrays temporary(allocations.selections_buffer, param.max_detections);
+
+  // Only batch of 1 is supported atm
+  auto num_boxes = boxes_a.shape().dim(1);
+  size_t num_classes = param.num_classes;
+  size_t num_classes_with_background = scores_a.shape().dim(2);
+  bool have_background = num_classes_with_background != num_classes;
+
+  size_t max_classes_per_box = std::min<size_t>(num_classes, param.max_classes_per_detection);
+
+  // TODO move this to allocations
+  std::vector<int> sorted_class_indices(num_boxes * num_classes);
+
+  Array<int> class_indices(sorted_class_indices.data(), {num_boxes, num_classes});
+
+  // TODO move to allocations
+  std::vector<float> max_scores(num_boxes);
+
+  for (size_t row = 0; row < num_boxes; row++)
+  {
+    auto box_scores = scores_a.slice(0, row).offset(have_background ? 1 : 0);
+    auto indices = class_indices.slice(row);
+
+    PartialArgSort(box_scores, indices, num_classes);
+
+    max_scores[row] = box_scores[indices[0]];
+  }
+
+  auto anchors_a =
+    Array<float>(reinterpret_cast<float *>(param.anchors_input->buffer()), {num_boxes, 4});
+  auto decoded_boxes = decodeBoxes(boxes_a, anchors_a, param.center_box_format, param.scales);
+
+  int num_selected =
+    doSingleClass(decoded_boxes, max_scores, param, temporary, param.max_detections);
+
+  collectBoxes(temporary, decoded_boxes, max_scores, num_selected, outputs, class_indices,
+               max_classes_per_box);
+
+  num_selected_a.at(0) = num_selected;
+}
+} // namespace
+
+template <typename T> Array<T> toArray(uint8_t *ptr, std::vector<int32_t> &descr)
+{
+  ndarray::Shape shape(descr.size());
+  for (size_t i = 0; i < descr.size(); ++i)
+  {
+    shape.dim(i) = descr[i];
+  }
+
+  return Array<T>{reinterpret_cast<T *>(ptr), shape};
+}
+
+void DetectionPostProcessLayer::configure(DetectionPostProcessParameters parameters)
+{
+  _parameters = std::move(parameters);
+  _allocations.selections_buffer = new int[_parameters.max_detections * 2];
+}
+
+void DetectionPostProcessLayer::run()
+{
+  auto nbatches = (unsigned int)_parameters.boxes_descr[0];
+  // no suport for batch other than 1( it's fine since tflite does not support
+  // batch for postprocess either )
+  assert(nbatches == 1);
+
+  auto boxes_a = toArray<float>(_parameters.boxes_input->buffer(), _parameters.boxes_descr);
+  auto scores_a = toArray<float>(_parameters.scores_input->buffer(), _parameters.scrores_descr);
+
+  auto num_selected_a = ndarray::Array<float>(
+    reinterpret_cast<float *>(_parameters.num_selections_output->buffer()), {nbatches});
+
+  OutputArrays outputArrays(reinterpret_cast<CornerBox *>(_parameters.box_coords_output->buffer()),
+                            reinterpret_cast<float *>(_parameters.box_scores_output->buffer()),
+                            reinterpret_cast<float *>(_parameters.box_classes_output->buffer()),
+                            reinterpret_cast<int *>(_parameters.num_selections_output->buffer()),
+                            _parameters.max_detections);
+
+  DetectionPostProcess(boxes_a, scores_a, num_selected_a, _parameters, _allocations, outputArrays);
+}
+
+DetectionPostProcessLayer::~DetectionPostProcessLayer() { delete[] _allocations.selections_buffer; }
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/cpu/ops/DetectionPostProcessLayer.h b/runtime/onert/backend/cpu/ops/DetectionPostProcessLayer.h
new file mode 100644
index 000000000..836a70cac
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/DetectionPostProcessLayer.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_OPS_DPP_H__
+#define __ONERT_BACKEND_CPU_OPS_DPP_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+class DetectionPostProcessLayer : public ::onert::exec::IFunction
+{
+public:
+  struct CornerBox
+  {
+    float y1, x1;
+    float y2, x2;
+  };
+
+  struct CenterSizeBox
+  {
+    float y, x;
+    float h, w;
+  };
+
+  struct DetectionPostProcessParameters
+  {
+    const IPortableTensor *boxes_input;
+    const IPortableTensor *scores_input;
+    const IPortableTensor *anchors_input;
+    IPortableTensor *box_coords_output;
+    IPortableTensor *box_classes_output;
+    IPortableTensor *box_scores_output;
+    IPortableTensor *num_selections_output;
+    std::vector<int32_t> boxes_descr;
+    std::vector<int32_t> scrores_descr;
+
+    uint32_t max_detections;
+    float score_threshold;
+    float iou_threshold; // intersection-over-union
+    uint32_t max_boxes_per_class;
+    bool center_box_format = false;
+    int32_t num_classes;
+    int32_t max_classes_per_detection;
+    CenterSizeBox scales;
+  };
+
+  enum SelectionFormat
+  {
+    BOX_INDEX = 1,
+    CLASS_INDEX = 0
+  };
+
+  struct Allocations
+  {
+    int *selections_buffer = nullptr;
+    // TODO move all dynamic allocations here, and into configure phase
+  };
+
+  DetectionPostProcessLayer() : _parameters{}
+  {
+    // DO NOTHING
+  }
+
+  virtual ~DetectionPostProcessLayer();
+
+public:
+  void configure(DetectionPostProcessParameters parameters);
+
+  void run() override;
+
+private:
+  DetectionPostProcessParameters _parameters;
+
+  Allocations _allocations;
+};
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_OPS_DPP_H__
diff --git a/runtime/onert/core/include/compiler/StaticShapeInferer.h b/runtime/onert/core/include/compiler/StaticShapeInferer.h
index 2e484e649..b2272e262 100644
--- a/runtime/onert/core/include/compiler/StaticShapeInferer.h
+++ b/runtime/onert/core/include/compiler/StaticShapeInferer.h
@@ -112,6 +112,7 @@ private:
   void visit(const ir::operation::Transpose &op) override;
   void visit(const ir::operation::Unpack &op) override;
   void visit(const ir::operation::While &op) override;
+  void visit(const ir::operation::DetectionPostProcess &op) override;
 
 private:
   /**
diff --git a/runtime/onert/core/include/exec/DynamicShapeInferer.h b/runtime/onert/core/include/exec/DynamicShapeInferer.h
index 3d040e2cc..f814b789a 100644
--- a/runtime/onert/core/include/exec/DynamicShapeInferer.h
+++ b/runtime/onert/core/include/exec/DynamicShapeInferer.h
@@ -67,6 +67,7 @@ public:
   void visit(const ir::operation::L2Normalization &op) override;
   void visit(const ir::operation::LSTM &op) override;
   void visit(const ir::operation::MatrixBandPart &op) override;
+  void visit(const ir::operation::DetectionPostProcess &op) override;
   void visit(const ir::operation::OneHot &op) override;
   void visit(const ir::operation::Pack &op) override;
   void visit(const ir::operation::Pad &op) override;
diff --git a/runtime/onert/core/include/ir/Operations.Include.h b/runtime/onert/core/include/ir/Operations.Include.h
index 45fadc474..0eb45e1ee 100644
--- a/runtime/onert/core/include/ir/Operations.Include.h
+++ b/runtime/onert/core/include/ir/Operations.Include.h
@@ -50,6 +50,7 @@
 #include "ir/operation/LogSoftmax.h"
 #include "ir/operation/LSTM.h"
 #include "ir/operation/MatrixBandPart.h"
+#include "ir/operation/DetectionPostProcess.h"
 #include "ir/operation/OneHot.h"
 #include "ir/operation/Pack.h"
 #include "ir/operation/Pad.h"
diff --git a/runtime/onert/core/include/ir/Operations.lst b/runtime/onert/core/include/ir/Operations.lst
index 7f3c40b4b..f17fdfdd7 100644
--- a/runtime/onert/core/include/ir/Operations.lst
+++ b/runtime/onert/core/include/ir/Operations.lst
@@ -53,6 +53,7 @@ OP(LocalResponseNormalization)
 OP(LogSoftmax)
 OP(LSTM)
 OP(MatrixBandPart)
+OP(DetectionPostProcess)
 OP(OneHot)
 OP(Pack)
 OP(Pad)
diff --git a/runtime/onert/core/include/ir/operation/DetectionPostProcess.h b/runtime/onert/core/include/ir/operation/DetectionPostProcess.h
new file mode 100644
index 000000000..becb0e21a
--- /dev/null
+++ b/runtime/onert/core/include/ir/operation/DetectionPostProcess.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_MODEL_OPERATION_DETECTION_POST_PROCESS_NODE_H__
+#define __NEURUN_MODEL_OPERATION_DETECTION_POST_PROCESS_NODE_H__
+
+#include "ir/Operation.h"
+
+namespace onert
+{
+namespace ir
+{
+namespace operation
+{
+
+class DetectionPostProcess : public Operation
+{
+public:
+  enum Input
+  {
+    BOXES = 0,
+    SCORES = 1,
+    INPUT_ANCHORS = 2
+  };
+
+  enum Output
+  {
+    BOX_COORDS = 0,
+    BOX_CLASSES = 1,
+    BOX_SCORES = 2,
+    NUM_SELECTED = 3
+  };
+
+  struct Scale
+  {
+    float y_scale;
+    float x_scale;
+    float h_scale;
+    float w_scale;
+  };
+
+  struct Param
+  {
+    int max_detections;
+    float score_threshold;
+    float iou_threshold; // intersection-over-union
+    int max_boxes_per_class;
+    int32_t num_classes;
+    int32_t max_classes_per_detection;
+    // N*N complexity instead of N*N*M, where N - number of boxes and M number of classes
+    bool center_size_boxes;
+    bool do_fast_eval = true;
+    Scale scale;
+  };
+
+public:
+  DetectionPostProcess(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
+                       const Param &param);
+
+public:
+  void accept(OperationVisitor &v) const override;
+
+  std::string getName() const { return "DetectionPostProcess"; }
+
+public:
+  const Param &param() const { return _param; }
+  OpCode opcode() const final { return OpCode::DetectionPostProcess; }
+
+private:
+  Param _param;
+};
+
+} // namespace operation
+} // namespace ir
+} // namespace onert
+
+#endif // __NEURUN_MODEL_OPERATION_DETECTION_POST_PROCESS_NODE_H__
diff --git a/runtime/onert/core/src/compiler/StaticShapeInferer.cc b/runtime/onert/core/src/compiler/StaticShapeInferer.cc
index 5849a9801..f2fee2c3c 100644
--- a/runtime/onert/core/src/compiler/StaticShapeInferer.cc
+++ b/runtime/onert/core/src/compiler/StaticShapeInferer.cc
@@ -1302,6 +1302,30 @@ void StaticShapeInferer::visit(const ir::operation::While &op)
   }
 }
 
+void StaticShapeInferer::visit(const ir::operation::DetectionPostProcess &op)
+{
+  // TODO: NMS supports very limited input/output size.
+  ir::operation::DetectionPostProcess::Param param = op.param();
+
+  const int num_detected_boxes = param.max_detections * param.max_classes_per_detection;
+
+  const auto output_idx1 = op.getOutputs().at(0);
+  auto &output1 = _operands.at(output_idx1);
+  output1.info().shape({1, num_detected_boxes, 4});
+
+  const auto output_idx2 = op.getOutputs().at(1);
+  auto &output2 = _operands.at(output_idx2);
+  output2.info().shape({1, num_detected_boxes});
+
+  const auto output_idx3 = op.getOutputs().at(2);
+  auto &output3 = _operands.at(output_idx3);
+  output3.info().shape({1, num_detected_boxes});
+
+  const auto output_idx4 = op.getOutputs().at(3);
+  auto &output4 = _operands.at(output_idx4);
+  output4.info().shape({1});
+}
+
 } // namespace compiler
 
 } // namespace onert
diff --git a/runtime/onert/core/src/exec/DynamicShapeInferer.cc b/runtime/onert/core/src/exec/DynamicShapeInferer.cc
index dbf4eb28f..fb8058d23 100644
--- a/runtime/onert/core/src/exec/DynamicShapeInferer.cc
+++ b/runtime/onert/core/src/exec/DynamicShapeInferer.cc
@@ -601,6 +601,14 @@ void DynamicShapeInferer::visit(const ir::operation::MatrixBandPart &op)
   handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::INPUT));
 }
 
+void DynamicShapeInferer::visit(const ir::operation::DetectionPostProcess & /* op */)
+{
+  // NOTE DetectionPostProcess's undefined outputs' shape are decided on compile time
+  //      by static shape inferer.
+  //      DetectionPostProcess's outputs' shape are independent with input shape
+  //      and decided by parameter value.
+}
+
 void DynamicShapeInferer::visit(const ir::operation::OneHot &op)
 {
   auto output_ind = op.getOutputs().at(0);
diff --git a/runtime/onert/core/src/ir/OperationValidator.cc b/runtime/onert/core/src/ir/OperationValidator.cc
index 705a37e2c..094dbc0d5 100644
--- a/runtime/onert/core/src/ir/OperationValidator.cc
+++ b/runtime/onert/core/src/ir/OperationValidator.cc
@@ -211,6 +211,14 @@ void OperationValidator::visit(const operation::DepthToSpace &node)
   OP_REQUIRES(block_size > 0);
 }
 
+void OperationValidator::visit(const operation::DetectionPostProcess &node)
+{
+  auto param = node.param();
+
+  // FIXME: number of classes should be 1 for now.
+  OP_REQUIRES(param.num_classes == 1);
+}
+
 void OperationValidator::visit(const operation::DepthwiseConv2D &node)
 {
   const auto input_index{node.getInputs().at(operation::DepthwiseConv2D::Input::INPUT)};
diff --git a/runtime/onert/core/src/ir/OperationValidator.h b/runtime/onert/core/src/ir/OperationValidator.h
index 9829ca095..b9bcc4ee8 100644
--- a/runtime/onert/core/src/ir/OperationValidator.h
+++ b/runtime/onert/core/src/ir/OperationValidator.h
@@ -55,6 +55,7 @@ public:
   void visit(const operation::Conv2D &node) override;
   void visit(const operation::DepthToSpace &node) override;
   void visit(const operation::DepthwiseConv2D &node) override;
+  void visit(const operation::DetectionPostProcess &node) override;
   void visit(const operation::ElementwiseActivation &node) override;
   void visit(const operation::ElementwiseBinary &node) override;
   void visit(const operation::ElementwiseUnary &node) override;
diff --git a/runtime/onert/core/src/ir/operation/DetectionPostProcess.cc b/runtime/onert/core/src/ir/operation/DetectionPostProcess.cc
new file mode 100644
index 000000000..cd708796d
--- /dev/null
+++ b/runtime/onert/core/src/ir/operation/DetectionPostProcess.cc
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ir/operation/DetectionPostProcess.h"
+#include "ir/OperationVisitor.h"
+
+namespace onert
+{
+namespace ir
+{
+namespace operation
+{
+
+DetectionPostProcess::DetectionPostProcess(const OperandIndexSequence &inputs,
+                                           const OperandIndexSequence &outputs, const Param &param)
+  : Operation(OperandConstraint::createExact(3u), inputs, outputs), _param(param)
+{
+}
+
+void DetectionPostProcess::accept(OperationVisitor &v) const { v.visit(*this); }
+
+} // namespace operation
+} // namespace ir
+} // namespace onert
diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h
index c444e7365..6ba7ee922 100644
--- a/runtime/onert/frontend/base_loader/include/base_loader.h
+++ b/runtime/onert/frontend/base_loader/include/base_loader.h
@@ -142,6 +142,7 @@ private:
   void loadIf(const Operator *op, ir::Graph &subg);
   void loadLeakyRelu(const Operator *op, ir::Graph &subg);
   void loadLogSoftmax(const Operator *op, ir::Graph &subg);
+  void loadDetectionPostProcess(const Operator *op, ir::Graph &subg);
   void loadOneHot(const Operator *op, ir::Graph &subg);
   void loadPack(const Operator *op, ir::Graph &subg);
   void loadPool2D(const Operator *op, ir::Graph &subg, ir::operation::Pool2D::PoolType op_type);
@@ -928,6 +929,45 @@ void BaseLoader<LoaderDomain>::loadGather(const Operator *op, ir::Graph &subg)
 }
 
 template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadDetectionPostProcess(const Operator *op, ir::Graph &subg)
+{
+  const flexbuffers::Map &m =
+    flexbuffers::GetRoot(op->custom_options()->data(), op->custom_options()->size()).AsMap();
+
+  ir::operation::DetectionPostProcess::Param param;
+
+  param.max_detections = m["max_detections"].AsInt32();
+
+  // TODO fixme
+  param.max_classes_per_detection = m["max_classes_per_detection"].AsInt32();
+  if (m["detections_per_class"].IsNull())
+    param.max_boxes_per_class = 100;
+  else
+    param.max_boxes_per_class = m["detections_per_class"].AsInt32();
+
+  if (m["use_regular_nms"].IsNull())
+    param.do_fast_eval = true;
+  else
+    param.do_fast_eval = !m["use_regular_nms"].AsBool();
+
+  param.score_threshold = m["nms_score_threshold"].AsFloat();
+  param.iou_threshold = m["nms_iou_threshold"].AsFloat();
+
+  // TODO add num classes support
+  param.num_classes = m["num_classes"].AsInt32();
+
+  param.scale.y_scale = m["y_scale"].AsFloat();
+  param.scale.x_scale = m["x_scale"].AsFloat();
+  param.scale.h_scale = m["h_scale"].AsFloat();
+  param.scale.w_scale = m["w_scale"].AsFloat();
+
+  // TODO depends on input model framework
+  param.center_size_boxes = true;
+
+  loadOperationTo<ir::operation::DetectionPostProcess>(op, subg, param);
+}
+
+template <typename LoaderDomain>
 void BaseLoader<LoaderDomain>::loadBatchMatMul(const Operator *op, ir::Graph &subg)
 {
   ir::operation::BatchMatMul::Param param;
@@ -997,7 +1037,8 @@ void BaseLoader<LoaderDomain>::loadCustom(const Operator *op, ir::Graph &subg)
     BroadcastTo,
     FusedBatchNorm,
     StatelessRandomUniform,
-    Erf
+    Erf,
+    DetectionPostProcess
   };
 
   // Mapping from custom op name string to BuiltinOP enum
@@ -1011,6 +1052,7 @@ void BaseLoader<LoaderDomain>::loadCustom(const Operator *op, ir::Graph &subg)
     {"BroadcastTo", BuiltinOP::BroadcastTo},
     {"StatelessRandomUniform", BuiltinOP::StatelessRandomUniform},
     {"Erf", BuiltinOP::Erf},
+    {"TFLite_Detection_PostProcess", BuiltinOP::DetectionPostProcess},
   };
 
   try
@@ -1046,6 +1088,9 @@ void BaseLoader<LoaderDomain>::loadCustom(const Operator *op, ir::Graph &subg)
       case BuiltinOP::Erf:
         loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::ERF);
         break;
+      case BuiltinOP::DetectionPostProcess:
+        loadDetectionPostProcess(op, subg);
+        break;
       default:
         throw std::runtime_error{
           "Loader: Custom OP map is defined but operation loader function is not defined"};
diff --git a/tests/nnfw_api/src/CircleGen.cc b/tests/nnfw_api/src/CircleGen.cc
index 579d68c86..0ffc8fb44 100644
--- a/tests/nnfw_api/src/CircleGen.cc
+++ b/tests/nnfw_api/src/CircleGen.cc
@@ -15,6 +15,7 @@
  */
 
 #include "CircleGen.h"
+#include "flatbuffers/flexbuffers.h"
 
 CircleGen::CircleGen() : _subgraph_contexts(1) // Create primary subgraph
 {
@@ -189,6 +190,35 @@ uint32_t CircleGen::addOperatorDepthwiseConv2D(const OperatorParams &params,
                                 circle::BuiltinOptions_DepthwiseConv2DOptions, options);
 }
 
+uint32_t CircleGen::addOperatorDetectionPostProcess(const OperatorParams &params, int num_classes,
+                                                    float y_scale, float x_scale, float h_scale,
+                                                    float w_scale, float nms_score_threshold,
+                                                    float nms_iou_threshold, int max_detections,
+                                                    int max_classes_per_detection,
+                                                    int detections_per_class)
+{
+  // flexbuffer custom_option
+  auto flex_buffers = std::make_unique<flexbuffers::Builder>();
+  size_t map_start = flex_buffers->StartMap();
+  flex_buffers->Int("num_classes", num_classes);
+  flex_buffers->Float("y_scale", y_scale);
+  flex_buffers->Float("x_scale", x_scale);
+  flex_buffers->Float("h_scale", h_scale);
+  flex_buffers->Float("w_scale", w_scale);
+  flex_buffers->Float("nms_iou_threshold", nms_iou_threshold);
+  flex_buffers->Float("nms_score_threshold", nms_score_threshold);
+  flex_buffers->Int("max_detections", max_detections);
+  flex_buffers->Int("max_classes_per_detection", max_classes_per_detection);
+  flex_buffers->Int("detections_per_class", detections_per_class);
+  flex_buffers->EndMap(map_start);
+  flex_buffers->Finish();
+
+  return addCustomOperatorWithOptions(params, "TFLite_Detection_PostProcess",
+                                      circle::BuiltinOptions_NONE, 0, &flex_buffers->GetBuffer(),
+                                      circle::CustomOptionsFormat::CustomOptionsFormat_FLEXBUFFERS,
+                                      nullptr, nullptr);
+}
+
 uint32_t CircleGen::addOperatorElu(const OperatorParams &params)
 {
   return addOperatorWithOptions(params, circle::BuiltinOperator_ELU, circle::BuiltinOptions_NONE,
@@ -523,6 +553,23 @@ uint32_t CircleGen::addOperatorWithOptions(const OperatorParams &params,
   return ind;
 }
 
+uint32_t CircleGen::addCustomOperatorWithOptions(
+  const OperatorParams &params, std::string custom_code, circle::BuiltinOptions options_type,
+  flatbuffers::Offset<void> options, const std::vector<uint8_t> *custom_options,
+  circle::CustomOptionsFormat custom_options_format,
+  const std::vector<uint8_t> *mutating_variable_inputs, const std::vector<int32_t> *intermediates)
+
+{
+  uint32_t opcode_ind = addCustomOperatorCode(custom_code);
+  auto op = circle::CreateOperatorDirect(
+    _fbb, opcode_ind, &params.inputs, &params.outputs, options_type, options, custom_options,
+    custom_options_format, mutating_variable_inputs, intermediates);
+
+  uint32_t ind = curSubgCtx().operators.size();
+  curSubgCtx().operators.emplace_back(op);
+  return ind;
+}
+
 uint32_t CircleGen::addOperatorCode(circle::BuiltinOperator opcode)
 {
   // TODO If the same OperatorCode is registered already, just return it
@@ -531,6 +578,15 @@ uint32_t CircleGen::addOperatorCode(circle::BuiltinOperator opcode)
   return ind;
 }
 
+uint32_t CircleGen::addCustomOperatorCode(std::string custom_code)
+{
+  // TODO If the same OperatorCode is registered already, just return it
+  uint32_t ind = _opcodes.size();
+  _opcodes.emplace_back(
+    circle::CreateOperatorCodeDirect(_fbb, circle::BuiltinOperator_CUSTOM, custom_code.c_str()));
+  return ind;
+}
+
 flatbuffers::Offset<circle::Buffer> CircleGen::buildBuffer(const uint8_t *buf, size_t size)
 {
   if (buf == nullptr && size == 0)
diff --git a/tests/nnfw_api/src/CircleGen.h b/tests/nnfw_api/src/CircleGen.h
index ab7707d5a..f6f799668 100644
--- a/tests/nnfw_api/src/CircleGen.h
+++ b/tests/nnfw_api/src/CircleGen.h
@@ -159,6 +159,11 @@ public:
                                       int stride_w, int stride_h, int depth_multiplier,
                                       circle::ActivationFunctionType actfn, int dilation_w = 1,
                                       int dilation_h = 1);
+  uint32_t addOperatorDetectionPostProcess(const OperatorParams &params, int num_classes,
+                                           float y_scale, float x_scale, float h_scale,
+                                           float w_scale, float nms_score_threshold,
+                                           float nms_iou_threshold, int max_detections,
+                                           int max_classes_per_detection, int detections_per_class);
   uint32_t addOperatorElu(const OperatorParams &params);
   uint32_t addOperatorEqual(const OperatorParams &params);
   uint32_t addOperatorExpandDims(const OperatorParams &params);
@@ -220,7 +225,15 @@ private:
   uint32_t addOperatorWithOptions(const OperatorParams &params, circle::BuiltinOperator opcode,
                                   circle::BuiltinOptions options_type,
                                   flatbuffers::Offset<void> options);
+  uint32_t addCustomOperatorWithOptions(const OperatorParams &params, std::string custom_code,
+                                        circle::BuiltinOptions options_type,
+                                        flatbuffers::Offset<void> options,
+                                        const std::vector<uint8_t> *custom_options,
+                                        circle::CustomOptionsFormat custom_options_format,
+                                        const std::vector<uint8_t> *mutating_variable_inputs,
+                                        const std::vector<int32_t> *intermediates);
   uint32_t addOperatorCode(circle::BuiltinOperator opcode);
+  uint32_t addCustomOperatorCode(std::string custom_code);
   flatbuffers::Offset<circle::Buffer> buildBuffer(const uint8_t *buf, size_t size);
   flatbuffers::Offset<circle::Tensor> buildTensor(const TensorParams &params);
   flatbuffers::Offset<circle::Tensor> buildTensor(const TensorParams &params, float scale,
diff --git a/tests/nnfw_api/src/one_op_tests/ArgMinMax.cc b/tests/nnfw_api/src/one_op_tests/ArgMinMax.cc
index 3df7e7403..dda098698 100644
--- a/tests/nnfw_api/src/one_op_tests/ArgMinMax.cc
+++ b/tests/nnfw_api/src/one_op_tests/ArgMinMax.cc
@@ -36,28 +36,6 @@ class ArgMinMaxVariation : public GenModelTest,
 // Reduce axis: 1
 // Output shape: {1, 2, 1}
 // Output type: Int32
-TEST_P(ArgMinMaxVariation, Test)
-{
-  auto &param = GetParam();
-
-  CircleGen cgen;
-  const auto output_type = circle::TensorType::TensorType_INT32;
-  std::vector<int32_t> axis_data{1};
-  uint32_t axis_buf = cgen.addBuffer(axis_data);
-  int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf});
-  int in = cgen.addTensor({{1, 2, 2, 1}, param.input_type}, param.scale, param.zero_point);
-  int out = cgen.addTensor({{1, 2, 1}, output_type});
-  param.is_argmax ? cgen.addOperatorArgMax({{in, axis}, {out}}, output_type)
-                  : cgen.addOperatorArgMin({{in, axis}, {out}}, output_type);
-  cgen.setInputsAndOutputs({in}, {out});
-
-  _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->addTestCase(param.tcd);
-  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
-
-  SUCCEED();
-}
-
 // Test with different input type and value
 INSTANTIATE_TEST_CASE_P(
   GenModelTest, ArgMinMaxVariation,
@@ -93,6 +71,28 @@ INSTANTIATE_TEST_CASE_P(
       TestCaseData{}.addInput<int8_t>({1, 4, 2, 3}).addOutput<int32_t>({0, 1}), false,
       circle::TensorType::TensorType_INT8, 1.0, 1}));
 
+TEST_P(ArgMinMaxVariation, Test)
+{
+  auto &param = GetParam();
+
+  CircleGen cgen;
+  const auto output_type = circle::TensorType::TensorType_INT32;
+  std::vector<int32_t> axis_data{1};
+  uint32_t axis_buf = cgen.addBuffer(axis_data);
+  int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf});
+  int in = cgen.addTensor({{1, 2, 2, 1}, param.input_type}, param.scale, param.zero_point);
+  int out = cgen.addTensor({{1, 2, 1}, output_type});
+  param.is_argmax ? cgen.addOperatorArgMax({{in, axis}, {out}}, output_type)
+                  : cgen.addOperatorArgMin({{in, axis}, {out}}, output_type);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(param.tcd);
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
 TEST_F(GenModelTest, OneOp_ArgMax_Int64_AxisToConst)
 {
   CircleGen cgen;
@@ -132,35 +132,41 @@ TEST_F(GenModelTest, OneOp_ArgMax_AxisToVar)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_ArgMax_InvalidAxis0)
+TEST_P(ArgMinMaxVariation, neg_InvalidAxis0)
 {
+  auto &param = GetParam();
+
   CircleGen cgen;
   const auto output_type = circle::TensorType::TensorType_INT32;
   std::vector<int32_t> axis_data{4};
   uint32_t axis_buf = cgen.addBuffer(axis_data);
   int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf});
-  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int in = cgen.addTensor({{1, 2, 2, 1}, param.input_type}, param.scale, param.zero_point);
   int out = cgen.addTensor({{1, 2, 1}, output_type});
-  cgen.addOperatorArgMax({{in, axis}, {out}}, output_type);
+  param.is_argmax ? cgen.addOperatorArgMax({{in, axis}, {out}}, output_type)
+                  : cgen.addOperatorArgMin({{in, axis}, {out}}, output_type);
   cgen.setInputsAndOutputs({in}, {out});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
   _context->expectFailCompile();
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
 
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_ArgMax_InvalidAxis1)
+TEST_P(ArgMinMaxVariation, neg_InvalidAxis1)
 {
+  auto &param = GetParam();
+
   CircleGen cgen;
   const auto output_type = circle::TensorType::TensorType_INT32;
   std::vector<int32_t> axis_data{-3};
   uint32_t axis_buf = cgen.addBuffer(axis_data);
   int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf});
-  int in = cgen.addTensor({{2, 2}, circle::TensorType::TensorType_FLOAT32});
+  int in = cgen.addTensor({{2, 2}, param.input_type}, param.scale, param.zero_point);
   int out = cgen.addTensor({{2}, output_type});
-  cgen.addOperatorArgMax({{in, axis}, {out}}, output_type);
+  param.is_argmax ? cgen.addOperatorArgMax({{in, axis}, {out}}, output_type)
+                  : cgen.addOperatorArgMin({{in, axis}, {out}}, output_type);
   cgen.setInputsAndOutputs({in}, {out});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
@@ -188,16 +194,19 @@ TEST_F(GenModelTest, neg_OneOp_ArgMax_InType)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_ArgMax_AxisType)
+TEST_P(ArgMinMaxVariation, neg_AxisType)
 {
+  auto &param = GetParam();
+
   CircleGen cgen;
-  const auto output_type = circle::TensorType::TensorType_FLOAT32;
+  const auto output_type = circle::TensorType::TensorType_INT32;
   std::vector<float> axis_data{4};
   uint32_t axis_buf = cgen.addBuffer(axis_data);
   int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32, axis_buf});
-  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int in = cgen.addTensor({{1, 2, 2, 1}, param.input_type}, param.scale, param.zero_point);
   int out = cgen.addTensor({{1, 2, 1}, output_type});
-  cgen.addOperatorArgMax({{in, axis}, {out}}, output_type);
+  param.is_argmax ? cgen.addOperatorArgMax({{in, axis}, {out}}, output_type)
+                  : cgen.addOperatorArgMin({{in, axis}, {out}}, output_type);
   cgen.setInputsAndOutputs({in}, {out});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
@@ -224,16 +233,20 @@ TEST_F(GenModelTest, neg_OneOp_ArgMax_OutType)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_ArgMax_paramType)
+TEST_P(ArgMinMaxVariation, neg_paramType)
 {
+  auto &param = GetParam();
+
   CircleGen cgen;
   const auto output_type = circle::TensorType::TensorType_INT32;
+  const auto output_param = circle::TensorType::TensorType_INT64;
   std::vector<int32_t> axis_data{4};
   uint32_t axis_buf = cgen.addBuffer(axis_data);
   int axis = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, axis_buf});
-  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int in = cgen.addTensor({{1, 2, 2, 1}, param.input_type}, param.scale, param.zero_point);
   int out = cgen.addTensor({{1, 2, 1}, output_type});
-  cgen.addOperatorArgMax({{in, axis}, {out}}, circle::TensorType::TensorType_INT64);
+  param.is_argmax ? cgen.addOperatorArgMax({{in, axis}, {out}}, output_param)
+                  : cgen.addOperatorArgMin({{in, axis}, {out}}, output_param);
   cgen.setInputsAndOutputs({in}, {out});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
diff --git a/tests/nnfw_api/src/one_op_tests/AveragePool2D.cc b/tests/nnfw_api/src/one_op_tests/AveragePool2D.cc
index 2fb1d6898..15ddac210 100644
--- a/tests/nnfw_api/src/one_op_tests/AveragePool2D.cc
+++ b/tests/nnfw_api/src/one_op_tests/AveragePool2D.cc
@@ -42,27 +42,6 @@ class AveragePool2DVariation : public GenModelTest,
 {
 };
 
-TEST_P(AveragePool2DVariation, Test)
-{
-  auto &param = GetParam();
-  CircleGen cgen;
-
-  int in = cgen.addTensor({param.input_shape, param.type.data_type}, param.type.scale,
-                          param.type.zero_point);
-  int out = cgen.addTensor({param.output_shape, param.type.data_type}, param.type.scale,
-                           param.type.zero_point);
-  cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, param.param.stride_w,
-                                param.param.stride_h, param.param.filter_w, param.param.filter_h,
-                                circle::ActivationFunctionType_NONE);
-  cgen.setInputsAndOutputs({in}, {out});
-
-  _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->addTestCase(param.tcd);
-  _context->setBackends(param.backend);
-
-  SUCCEED();
-}
-
 // Test with different input type and value
 INSTANTIATE_TEST_CASE_P(
   GenModelTest, AveragePool2DVariation,
@@ -108,6 +87,27 @@ INSTANTIATE_TEST_CASE_P(
       {circle::TensorType::TensorType_INT8, 2.0, -1},
       {"cpu"}}));
 
+TEST_P(AveragePool2DVariation, Test)
+{
+  auto &param = GetParam();
+  CircleGen cgen;
+
+  int in = cgen.addTensor({param.input_shape, param.type.data_type}, param.type.scale,
+                          param.type.zero_point);
+  int out = cgen.addTensor({param.output_shape, param.type.data_type}, param.type.scale,
+                           param.type.zero_point);
+  cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, param.param.stride_w,
+                                param.param.stride_h, param.param.filter_w, param.param.filter_h,
+                                circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({in}, {out});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(param.tcd);
+  _context->setBackends(param.backend);
+
+  SUCCEED();
+}
+
 TEST_F(GenModelTest, neg_OneOp_AvgPool2D_3DInput)
 {
   // 3D Tensors are not supported
@@ -142,13 +142,18 @@ TEST_F(GenModelTest, neg_OneOp_AvgPool2D_2DInput)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_AvgPool2D_InvalidPaddingType)
+TEST_P(AveragePool2DVariation, neg_InvalidPaddingType)
 {
+  auto &param = GetParam();
   CircleGen cgen;
-  int in = cgen.addTensor({{2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
-  int out = cgen.addTensor({{1, 1, 1}, circle::TensorType::TensorType_FLOAT32});
-  cgen.addOperatorAveragePool2D({{in}, {out}}, static_cast<circle::Padding>(99), 2, 2, 2, 2,
-                                circle::ActivationFunctionType_NONE);
+
+  int in = cgen.addTensor({param.input_shape, param.type.data_type}, param.type.scale,
+                          param.type.zero_point);
+  int out = cgen.addTensor({param.output_shape, param.type.data_type}, param.type.scale,
+                           param.type.zero_point);
+  cgen.addOperatorAveragePool2D({{in}, {out}}, static_cast<circle::Padding>(99),
+                                param.param.stride_w, param.param.stride_h, param.param.filter_w,
+                                param.param.filter_h, circle::ActivationFunctionType_NONE);
   cgen.setInputsAndOutputs({in}, {out});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
@@ -157,12 +162,17 @@ TEST_F(GenModelTest, neg_OneOp_AvgPool2D_InvalidPaddingType)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_AvgPool2D_InvalidFilterSize_1)
+TEST_P(AveragePool2DVariation, neg_InvalidFilterSize_1)
 {
+  auto &param = GetParam();
   CircleGen cgen;
-  int in = cgen.addTensor({{2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
-  int out = cgen.addTensor({{1, 1, 1}, circle::TensorType::TensorType_FLOAT32});
-  cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, 2, 2, -1, 2,
+
+  int in = cgen.addTensor({param.input_shape, param.type.data_type}, param.type.scale,
+                          param.type.zero_point);
+  int out = cgen.addTensor({param.output_shape, param.type.data_type}, param.type.scale,
+                           param.type.zero_point);
+  cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, param.param.stride_w,
+                                param.param.stride_h, -1, param.param.filter_h,
                                 circle::ActivationFunctionType_NONE);
   cgen.setInputsAndOutputs({in}, {out});
 
@@ -172,12 +182,17 @@ TEST_F(GenModelTest, neg_OneOp_AvgPool2D_InvalidFilterSize_1)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_AvgPool2D_InvalidFilterSize_2)
+TEST_P(AveragePool2DVariation, neg_InvalidFilterSize_2)
 {
+  auto &param = GetParam();
   CircleGen cgen;
-  int in = cgen.addTensor({{2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
-  int out = cgen.addTensor({{1, 1, 1}, circle::TensorType::TensorType_FLOAT32});
-  cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, 2, 2, 2, 0,
+
+  int in = cgen.addTensor({param.input_shape, param.type.data_type}, param.type.scale,
+                          param.type.zero_point);
+  int out = cgen.addTensor({param.output_shape, param.type.data_type}, param.type.scale,
+                           param.type.zero_point);
+  cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, param.param.stride_w,
+                                param.param.stride_h, param.param.filter_w, 0,
                                 circle::ActivationFunctionType_NONE);
   cgen.setInputsAndOutputs({in}, {out});
 
@@ -187,12 +202,17 @@ TEST_F(GenModelTest, neg_OneOp_AvgPool2D_InvalidFilterSize_2)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_AvgPool2D_InvalidStrides_1)
+TEST_P(AveragePool2DVariation, neg_InvalidStrides_1)
 {
+  auto &param = GetParam();
   CircleGen cgen;
-  int in = cgen.addTensor({{2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
-  int out = cgen.addTensor({{1, 1, 1}, circle::TensorType::TensorType_FLOAT32});
-  cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, 0, 2, 2, 2,
+
+  int in = cgen.addTensor({param.input_shape, param.type.data_type}, param.type.scale,
+                          param.type.zero_point);
+  int out = cgen.addTensor({param.output_shape, param.type.data_type}, param.type.scale,
+                           param.type.zero_point);
+  cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, 0, param.param.stride_h,
+                                param.param.filter_w, param.param.filter_h,
                                 circle::ActivationFunctionType_NONE);
   cgen.setInputsAndOutputs({in}, {out});
 
@@ -202,12 +222,17 @@ TEST_F(GenModelTest, neg_OneOp_AvgPool2D_InvalidStrides_1)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_AvgPool2D_InvalidStrides_2)
+TEST_P(AveragePool2DVariation, neg_InvalidStrides_2)
 {
+  auto &param = GetParam();
   CircleGen cgen;
-  int in = cgen.addTensor({{2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
-  int out = cgen.addTensor({{1, 1, 1}, circle::TensorType::TensorType_FLOAT32});
-  cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, 1, -100, 2, 2,
+
+  int in = cgen.addTensor({param.input_shape, param.type.data_type}, param.type.scale,
+                          param.type.zero_point);
+  int out = cgen.addTensor({param.output_shape, param.type.data_type}, param.type.scale,
+                           param.type.zero_point);
+  cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, param.param.stride_w, -100,
+                                param.param.filter_w, param.param.filter_h,
                                 circle::ActivationFunctionType_NONE);
   cgen.setInputsAndOutputs({in}, {out});
 
diff --git a/tests/nnfw_api/src/one_op_tests/Concat.cc b/tests/nnfw_api/src/one_op_tests/Concat.cc
index 6e2435965..f4397ba66 100644
--- a/tests/nnfw_api/src/one_op_tests/Concat.cc
+++ b/tests/nnfw_api/src/one_op_tests/Concat.cc
@@ -59,25 +59,6 @@ class ConcatVariation : public GenModelTest,
 
 // Input shape: {2, 3} / {2, 3}
 // Output shape: {4, 3}
-TEST_P(ConcatVariation, Test)
-{
-  auto &param = GetParam();
-
-  CircleGen cgen;
-  int input1 = cgen.addTensor({{2, 3}, param.type}, param.scale, param.zero_point);
-  int input2 = cgen.addTensor({{2, 3}, param.type}, param.scale, param.zero_point);
-  int output = cgen.addTensor({{4, 3}, param.type}, param.scale, param.zero_point);
-  cgen.addOperatorConcatenation({{input1, input2}, {output}}, 0,
-                                circle::ActivationFunctionType_NONE);
-  cgen.setInputsAndOutputs({input1, input2}, {output});
-
-  _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->addTestCase(param.tcd);
-  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
-
-  SUCCEED();
-}
-
 INSTANTIATE_TEST_CASE_P(
   GenModelTest, ConcatVariation,
   ::testing::Values(
@@ -107,6 +88,25 @@ INSTANTIATE_TEST_CASE_P(
                                              {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}}),
                          circle::TensorType::TensorType_INT64}));
 
+TEST_P(ConcatVariation, Test)
+{
+  auto &param = GetParam();
+
+  CircleGen cgen;
+  int input1 = cgen.addTensor({{2, 3}, param.type}, param.scale, param.zero_point);
+  int input2 = cgen.addTensor({{2, 3}, param.type}, param.scale, param.zero_point);
+  int output = cgen.addTensor({{4, 3}, param.type}, param.scale, param.zero_point);
+  cgen.addOperatorConcatenation({{input1, input2}, {output}}, 0,
+                                circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({input1, input2}, {output});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(param.tcd);
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
 TEST_F(GenModelTest, OneOp_Concat_Subtensor_4D)
 {
   CircleGen cgen;
@@ -180,13 +180,14 @@ TEST_F(GenModelTest, OneOp_Concat_Subtensor_4D)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_Concat_InvalidAxis)
+TEST_P(ConcatVariation, neg_InvalidAxis)
 {
-  CircleGen cgen;
+  auto &param = GetParam();
 
-  int input1 = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_FLOAT32});
-  int input2 = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_FLOAT32});
-  int output = cgen.addTensor({{4, 3}, circle::TensorType::TensorType_FLOAT32});
+  CircleGen cgen;
+  int input1 = cgen.addTensor({{2, 3}, param.type}, param.scale, param.zero_point);
+  int input2 = cgen.addTensor({{2, 3}, param.type}, param.scale, param.zero_point);
+  int output = cgen.addTensor({{4, 3}, param.type}, param.scale, param.zero_point);
   int axis = 2;
 
   cgen.addOperatorConcatenation({{input1, input2}, {output}}, axis,
@@ -200,13 +201,14 @@ TEST_F(GenModelTest, neg_OneOp_Concat_InvalidAxis)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_Concat_InvalidRank)
+TEST_P(ConcatVariation, neg_InvalidRank)
 {
-  CircleGen cgen;
+  auto &param = GetParam();
 
-  int input1 = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_FLOAT32});
-  int input2 = cgen.addTensor({{1, 2, 3}, circle::TensorType::TensorType_FLOAT32});
-  int output = cgen.addTensor({{1, 2, 3}, circle::TensorType::TensorType_FLOAT32});
+  CircleGen cgen;
+  int input1 = cgen.addTensor({{2, 3}, param.type}, param.scale, param.zero_point);
+  int input2 = cgen.addTensor({{1, 2, 3}, param.type}, param.scale, param.zero_point);
+  int output = cgen.addTensor({{1, 4, 3}, param.type}, param.scale, param.zero_point);
   int axis = 0;
 
   cgen.addOperatorConcatenation({{input1, input2}, {output}}, axis,
@@ -220,13 +222,14 @@ TEST_F(GenModelTest, neg_OneOp_Concat_InvalidRank)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_Concat_InvalidDimension)
+TEST_P(ConcatVariation, neg_InvalidDimension)
 {
-  CircleGen cgen;
+  auto &param = GetParam();
 
-  int input1 = cgen.addTensor({{2, 3}, circle::TensorType::TensorType_FLOAT32});
-  int input2 = cgen.addTensor({{3, 2}, circle::TensorType::TensorType_FLOAT32});
-  int output = cgen.addTensor({{4, 3}, circle::TensorType::TensorType_FLOAT32});
+  CircleGen cgen;
+  int input1 = cgen.addTensor({{2, 3}, param.type}, param.scale, param.zero_point);
+  int input2 = cgen.addTensor({{3, 2}, param.type}, param.scale, param.zero_point);
+  int output = cgen.addTensor({{4, 3}, param.type}, param.scale, param.zero_point);
   int axis = 0;
 
   cgen.addOperatorConcatenation({{input1, input2}, {output}}, axis,
diff --git a/tests/nnfw_api/src/one_op_tests/DepthToSpace.cc b/tests/nnfw_api/src/one_op_tests/DepthToSpace.cc
index 9f563401f..a4fe88493 100644
--- a/tests/nnfw_api/src/one_op_tests/DepthToSpace.cc
+++ b/tests/nnfw_api/src/one_op_tests/DepthToSpace.cc
@@ -29,6 +29,9 @@ class DepthToSpaceVariation : public GenModelTest,
 {
 };
 
+// Input shape: {1, 1, 2, 4}
+// Block size: 2
+// Output shape: {1, 2, 4, 1}
 INSTANTIATE_TEST_CASE_P(
   GenModelTest, DepthToSpaceVariation,
   ::testing::Values(
@@ -52,9 +55,6 @@ INSTANTIATE_TEST_CASE_P(
       uniformTCD<int8_t>({{1, 2, 3, 4, 5, 6, 7, 8}}, {{1, 2, 5, 6, 3, 4, 7, 8}}),
       circle::TensorType::TensorType_INT8, 1.0f, -2}));
 
-// Input shape: {1, 1, 2, 4}
-// Block size: 2
-// Output shape: {1, 2, 4, 1}
 TEST_P(DepthToSpaceVariation, Test)
 {
   auto &param = GetParam();
@@ -72,12 +72,13 @@ TEST_P(DepthToSpaceVariation, Test)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_DepthToSpace_Blocksize)
+TEST_P(DepthToSpaceVariation, neg_Blocksize)
 {
+  auto &param = GetParam();
+
   CircleGen cgen;
-  circle::TensorType data_type = circle::TensorType::TensorType_FLOAT32;
-  int in = cgen.addTensor({{1, 1, 2, 4}, data_type});
-  int out = cgen.addTensor({{1, 2, 4, 1}, data_type});
+  int in = cgen.addTensor({{1, 1, 2, 4}, param.type}, param.scale, param.zero_point);
+  int out = cgen.addTensor({{1, 2, 4, 1}, param.type}, param.scale, param.zero_point);
   cgen.addOperatorDepthToSpace({{in}, {out}}, -2);
   cgen.setInputsAndOutputs({in}, {out});
 
diff --git a/tests/nnfw_api/src/one_op_tests/DepthwiseConv2D.cc b/tests/nnfw_api/src/one_op_tests/DepthwiseConv2D.cc
index 658c44cb9..a0bdbf9e6 100644
--- a/tests/nnfw_api/src/one_op_tests/DepthwiseConv2D.cc
+++ b/tests/nnfw_api/src/one_op_tests/DepthwiseConv2D.cc
@@ -257,50 +257,6 @@ class DepthwiseConv2DQuantTest
 using DepthwiseConv2DQuantTestParamU8 = DepthwiseConv2DQuantTestParam<uint8_t>;
 using DepthwiseConv2DQuantTestU8 = DepthwiseConv2DQuantTest<uint8_t>;
 
-CircleBuffer genDepthwiseConv2DQuantU8Model(int stride, int input_depth, int depth_multiplier)
-{
-  assert(1 <= stride && stride <= 2);
-  assert(1 <= input_depth && input_depth <= 16);
-  assert(1 <= depth_multiplier && depth_multiplier <= 32);
-
-  const int output_depth = input_depth * depth_multiplier;
-  assert(1 <= output_depth && output_depth <= 32);
-
-  CircleGen cgen;
-  uint32_t ker_buf = cgen.addBuffer(std::vector<uint8_t>{
-    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1,
-    2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
-    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1,
-    2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
-    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
-  uint32_t bias_buf = cgen.addBuffer(std::vector<int32_t>(output_depth, 0));
-  int in = cgen.addTensor({{1, 2, 2, input_depth}, circle::TensorType_UINT8}, 0.5, 0);
-  int ker = cgen.addTensor({{1, 2, 2, output_depth}, circle::TensorType_UINT8, ker_buf}, 0.5, 0);
-  int bias = cgen.addTensor({{output_depth}, circle::TensorType_INT32, bias_buf}, 0.25, 0);
-  int out = cgen.addTensor({{1, 1, 1, output_depth}, circle::TensorType_UINT8}, 1, 0);
-  cgen.addOperatorDepthwiseConv2D({{in, ker, bias}, {out}}, circle::Padding::Padding_VALID, stride,
-                                  stride, depth_multiplier, circle::ActivationFunctionType_NONE);
-  cgen.setInputsAndOutputs({in}, {out});
-  return cgen.finish();
-}
-
-TEST_P(DepthwiseConv2DQuantTestU8, Test)
-{
-  // Same input is used for all tests but output differs
-  static const std::vector<uint8_t> input64{
-    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 5, 4, 3, 2, 5, 4, 3, 2, 5, 4, 3, 2, 5, 4, 3, 2,
-    2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 3, 5, 8, 8, 5, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2};
-
-  auto &param = GetParam();
-  _context = std::make_unique<GenModelTestContext>(
-    genDepthwiseConv2DQuantU8Model(param.stride, param.input_depth, param.depth_multiplier));
-  std::vector<uint8_t> ref_input(input64.begin(), input64.begin() + param.input_depth * 4);
-  _context->addTestCase(uniformTCD<uint8_t>({ref_input}, {param.ref_output}));
-  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
-
-  SUCCEED();
-}
-
 // Test with different InputDepth and DepthMultiplier. The values are intended to test optimized CPU
 // kernels.
 INSTANTIATE_TEST_CASE_P(
@@ -337,10 +293,7 @@ INSTANTIATE_TEST_CASE_P(
     DepthwiseConv2DQuantTestParamU8{
       2, 16, 1, std::vector<uint8_t>{0, 3, 8, 16, 0, 4, 7, 12, 0, 3, 7, 13, 0, 4, 7, 12}}));
 
-using DepthwiseConv2DQuantTestParamI8 = DepthwiseConv2DQuantTestParam<int8_t>;
-using DepthwiseConv2DQuantTestI8 = DepthwiseConv2DQuantTest<int8_t>;
-
-CircleBuffer genDepthwiseConv2DQuantI8Model(int stride, int input_depth, int depth_multiplier)
+CircleBuffer genDepthwiseConv2DQuantU8Model(int stride, int input_depth, int depth_multiplier)
 {
   assert(1 <= stride && stride <= 2);
   assert(1 <= input_depth && input_depth <= 16);
@@ -350,40 +303,43 @@ CircleBuffer genDepthwiseConv2DQuantI8Model(int stride, int input_depth, int dep
   assert(1 <= output_depth && output_depth <= 32);
 
   CircleGen cgen;
-  uint32_t ker_buf = cgen.addBuffer(std::vector<int8_t>{
+  uint32_t ker_buf = cgen.addBuffer(std::vector<uint8_t>{
     0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1,
     2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
     0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1,
     2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
     0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
   uint32_t bias_buf = cgen.addBuffer(std::vector<int32_t>(output_depth, 0));
-  int in = cgen.addTensor({{1, 2, 2, input_depth}, circle::TensorType_INT8}, 0.5, 0);
-  int ker = cgen.addTensor({{1, 2, 2, output_depth}, circle::TensorType_INT8, ker_buf}, 0.5, 0);
+  int in = cgen.addTensor({{1, 2, 2, input_depth}, circle::TensorType_UINT8}, 0.5, 0);
+  int ker = cgen.addTensor({{1, 2, 2, output_depth}, circle::TensorType_UINT8, ker_buf}, 0.5, 0);
   int bias = cgen.addTensor({{output_depth}, circle::TensorType_INT32, bias_buf}, 0.25, 0);
-  int out = cgen.addTensor({{1, 1, 1, output_depth}, circle::TensorType_INT8}, 1, 0);
+  int out = cgen.addTensor({{1, 1, 1, output_depth}, circle::TensorType_UINT8}, 1, 0);
   cgen.addOperatorDepthwiseConv2D({{in, ker, bias}, {out}}, circle::Padding::Padding_VALID, stride,
                                   stride, depth_multiplier, circle::ActivationFunctionType_NONE);
   cgen.setInputsAndOutputs({in}, {out});
   return cgen.finish();
 }
 
-TEST_P(DepthwiseConv2DQuantTestI8, Test)
+TEST_P(DepthwiseConv2DQuantTestU8, Test)
 {
   // Same input is used for all tests but output differs
-  static const std::vector<int8_t> input64{
+  static const std::vector<uint8_t> input64{
     0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 5, 4, 3, 2, 5, 4, 3, 2, 5, 4, 3, 2, 5, 4, 3, 2,
     2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 3, 5, 8, 8, 5, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2};
 
   auto &param = GetParam();
   _context = std::make_unique<GenModelTestContext>(
-    genDepthwiseConv2DQuantI8Model(param.stride, param.input_depth, param.depth_multiplier));
-  std::vector<int8_t> ref_input(input64.begin(), input64.begin() + param.input_depth * 4);
-  _context->addTestCase(uniformTCD<int8_t>({ref_input}, {param.ref_output}));
+    genDepthwiseConv2DQuantU8Model(param.stride, param.input_depth, param.depth_multiplier));
+  std::vector<uint8_t> ref_input(input64.begin(), input64.begin() + param.input_depth * 4);
+  _context->addTestCase(uniformTCD<uint8_t>({ref_input}, {param.ref_output}));
   _context->setBackends({"acl_cl", "acl_neon", "cpu"});
 
   SUCCEED();
 }
 
+using DepthwiseConv2DQuantTestParamI8 = DepthwiseConv2DQuantTestParam<int8_t>;
+using DepthwiseConv2DQuantTestI8 = DepthwiseConv2DQuantTest<int8_t>;
+
 // Test with different InputDepth and DepthMultiplier. The values are intended to test optimized CPU
 // kernels.
 INSTANTIATE_TEST_CASE_P(
@@ -420,6 +376,50 @@ INSTANTIATE_TEST_CASE_P(
     DepthwiseConv2DQuantTestParamI8{
       2, 16, 1, std::vector<int8_t>{0, 3, 8, 16, 0, 4, 7, 12, 0, 3, 7, 13, 0, 4, 7, 12}}));
 
+CircleBuffer genDepthwiseConv2DQuantI8Model(int stride, int input_depth, int depth_multiplier)
+{
+  assert(1 <= stride && stride <= 2);
+  assert(1 <= input_depth && input_depth <= 16);
+  assert(1 <= depth_multiplier && depth_multiplier <= 32);
+
+  const int output_depth = input_depth * depth_multiplier;
+  assert(1 <= output_depth && output_depth <= 32);
+
+  CircleGen cgen;
+  uint32_t ker_buf = cgen.addBuffer(std::vector<int8_t>{
+    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1,
+    2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1,
+    2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
+  uint32_t bias_buf = cgen.addBuffer(std::vector<int32_t>(output_depth, 0));
+  int in = cgen.addTensor({{1, 2, 2, input_depth}, circle::TensorType_INT8}, 0.5, 0);
+  int ker = cgen.addTensor({{1, 2, 2, output_depth}, circle::TensorType_INT8, ker_buf}, 0.5, 0);
+  int bias = cgen.addTensor({{output_depth}, circle::TensorType_INT32, bias_buf}, 0.25, 0);
+  int out = cgen.addTensor({{1, 1, 1, output_depth}, circle::TensorType_INT8}, 1, 0);
+  cgen.addOperatorDepthwiseConv2D({{in, ker, bias}, {out}}, circle::Padding::Padding_VALID, stride,
+                                  stride, depth_multiplier, circle::ActivationFunctionType_NONE);
+  cgen.setInputsAndOutputs({in}, {out});
+  return cgen.finish();
+}
+
+TEST_P(DepthwiseConv2DQuantTestI8, Test)
+{
+  // Same input is used for all tests but output differs
+  static const std::vector<int8_t> input64{
+    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 5, 4, 3, 2, 5, 4, 3, 2, 5, 4, 3, 2, 5, 4, 3, 2,
+    2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 3, 5, 8, 8, 5, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2};
+
+  auto &param = GetParam();
+  _context = std::make_unique<GenModelTestContext>(
+    genDepthwiseConv2DQuantI8Model(param.stride, param.input_depth, param.depth_multiplier));
+  std::vector<int8_t> ref_input(input64.begin(), input64.begin() + param.input_depth * 4);
+  _context->addTestCase(uniformTCD<int8_t>({ref_input}, {param.ref_output}));
+  _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+  SUCCEED();
+}
+
 TEST_F(GenModelTest, neg_OneOp_DepthwiseConv2D_InvalidPaddingType)
 {
   _context = std::make_unique<GenModelTestContext>(genNegTestDepthwiseConv2DModel(
diff --git a/tests/nnfw_api/src/one_op_tests/DetectionPostProcess.cc b/tests/nnfw_api/src/one_op_tests/DetectionPostProcess.cc
new file mode 100644
index 000000000..188638bbb
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/DetectionPostProcess.cc
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+#include <memory>
+
+TEST_F(GenModelTest, OneOp_DetectionPostProcess_SingleBox)
+{
+  CircleGen cgen;
+
+  int boxes = cgen.addTensor({{1, 1, 4}, circle::TensorType::TensorType_FLOAT32});
+  int scores = cgen.addTensor({{1, 1, 2}, circle::TensorType::TensorType_FLOAT32});
+  int anchors = cgen.addTensor({{1, 1, 4}, circle::TensorType::TensorType_FLOAT32});
+
+  int box_coors = cgen.addTensor({{1, 1, 4}, circle::TensorType::TensorType_FLOAT32});
+  int box_classes = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int box_scores = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int num_selected = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+
+  cgen.addOperatorDetectionPostProcess(
+    {{boxes, scores, anchors}, {box_coors, box_classes, box_scores, num_selected}}, 1, 10, 10, 5, 5,
+    0.8, 0.5, 1, 1, 1);
+  cgen.setInputsAndOutputs({boxes, scores, anchors},
+                           {box_coors, box_classes, box_scores, num_selected});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{0, 0, 0, 0}, {0, 0.9}, {0, 0, 1, 1}},
+                                          {{-0.5, -0.5, 0.5, 0.5}, {0}, {0.9}, {1}}));
+  _context->setBackends({"cpu"});
+
+  SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_DetectionPostProcess_SinglBox_MultiClasses)
+{
+  CircleGen cgen;
+
+  int boxes = cgen.addTensor({{1, 1, 4}, circle::TensorType::TensorType_FLOAT32});
+  int scores = cgen.addTensor({{1, 1, 3}, circle::TensorType::TensorType_FLOAT32});
+  int anchors = cgen.addTensor({{1, 1, 4}, circle::TensorType::TensorType_FLOAT32});
+
+  int box_coors = cgen.addTensor({{1, 1, 4}, circle::TensorType::TensorType_FLOAT32});
+  int box_classes = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int box_scores = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+  int num_selected = cgen.addTensor({{1}, circle::TensorType::TensorType_FLOAT32});
+
+  cgen.addOperatorDetectionPostProcess(
+    {{boxes, scores, anchors}, {box_coors, box_classes, box_scores, num_selected}}, 2, 10, 10, 5, 5,
+    0.8, 0.5, 1, 1, 1);
+  cgen.setInputsAndOutputs({boxes, scores, anchors},
+                           {box_coors, box_classes, box_scores, num_selected});
+
+  _context = std::make_unique<GenModelTestContext>(cgen.finish());
+  _context->addTestCase(uniformTCD<float>({{0, 0, 0, 0}, {0, 0.7, 0.9}, {0, 0, 1, 1}},
+                                          {{-0.5, -0.5, 0.5, 0.5}, {1}, {0.9}, {1}}));
+  _context->setBackends({"cpu"});
+  _context->expectFailModelLoad();
+
+  SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/Pad.cc b/tests/nnfw_api/src/one_op_tests/Pad.cc
index 42971da79..c376c1c02 100644
--- a/tests/nnfw_api/src/one_op_tests/Pad.cc
+++ b/tests/nnfw_api/src/one_op_tests/Pad.cc
@@ -31,6 +31,21 @@ class PadVariation : public GenModelTest, public ::testing::WithParamInterface<P
 {
 };
 
+// Test with different value type
+INSTANTIATE_TEST_CASE_P(
+  GenModelTest, PadVariation,
+  ::testing::Values(
+    // float value
+    PadParam{uniformTCD<float>({{1, 2, 3, 4}}, {{0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0}})},
+    // uint8 value
+    PadParam{
+      uniformTCD<uint8_t>({{1, 2, 3, 4}}, {{8, 8, 8, 8, 8, 1, 2, 8, 8, 3, 4, 8, 8, 8, 8, 8}}),
+      circle::TensorType::TensorType_UINT8, 1.0, 8},
+    // int8 value
+    PadParam{uniformTCD<int8_t>({{-2, -1, 1, 2}},
+                                {{-5, -5, -5, -5, -5, -2, -1, -5, -5, 1, 2, -5, -5, -5, -5, -5}}),
+             circle::TensorType::TensorType_INT8, 1.0, -5}));
+
 TEST_P(PadVariation, Test)
 {
   auto &param = GetParam();
@@ -51,29 +66,16 @@ TEST_P(PadVariation, Test)
   SUCCEED();
 }
 
-// Test with different value type
-INSTANTIATE_TEST_CASE_P(
-  GenModelTest, PadVariation,
-  ::testing::Values(
-    // float value
-    PadParam{uniformTCD<float>({{1, 2, 3, 4}}, {{0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0}})},
-    // uint8 value
-    PadParam{
-      uniformTCD<uint8_t>({{1, 2, 3, 4}}, {{8, 8, 8, 8, 8, 1, 2, 8, 8, 3, 4, 8, 8, 8, 8, 8}}),
-      circle::TensorType::TensorType_UINT8, 1.0, 8},
-    // int8 value
-    PadParam{uniformTCD<int8_t>({{-2, -1, 1, 2}},
-                                {{-5, -5, -5, -5, -5, -2, -1, -5, -5, 1, 2, -5, -5, -5, -5, -5}}),
-             circle::TensorType::TensorType_INT8, 1.0, -5}));
-
-TEST_F(GenModelTest, neg_OneOp_Pad_InvalidPadRank)
+TEST_P(PadVariation, neg_InvalidPadRank)
 {
+  auto &param = GetParam();
+
   CircleGen cgen;
-  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int in = cgen.addTensor({{1, 2, 2, 1}, param.data_type}, param.scale, param.zero_point);
   std::vector<int32_t> padding_data{1, 1, 1, 1};
   uint32_t padding_buf = cgen.addBuffer(padding_data);
   int padding = cgen.addTensor({{4}, circle::TensorType::TensorType_INT32, padding_buf});
-  int out = cgen.addTensor({{1, 4, 4, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 4, 4, 1}, param.data_type}, param.scale, param.zero_point);
 
   cgen.addOperatorPad({{in, padding}, {out}});
   cgen.setInputsAndOutputs({in}, {out});
@@ -85,14 +87,16 @@ TEST_F(GenModelTest, neg_OneOp_Pad_InvalidPadRank)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_Pad_InvalidPadDim0)
+TEST_P(PadVariation, neg_InvalidPadDim0)
 {
+  auto &param = GetParam();
+
   CircleGen cgen;
-  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int in = cgen.addTensor({{1, 2, 2, 1}, param.data_type}, param.scale, param.zero_point);
   std::vector<int32_t> padding_data{1, 1, 1, 1};
   uint32_t padding_buf = cgen.addBuffer(padding_data);
   int padding = cgen.addTensor({{2, 2}, circle::TensorType::TensorType_INT32, padding_buf});
-  int out = cgen.addTensor({{1, 4, 4, 1}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 4, 4, 1}, param.data_type}, param.scale, param.zero_point);
 
   cgen.addOperatorPad({{in, padding}, {out}});
   cgen.setInputsAndOutputs({in}, {out});
@@ -104,14 +108,16 @@ TEST_F(GenModelTest, neg_OneOp_Pad_InvalidPadDim0)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_Pad_InvalidPadDim1)
+TEST_P(PadVariation, neg_InvalidPadDim1)
 {
+  auto &param = GetParam();
+
   CircleGen cgen;
-  int in = cgen.addTensor({{1, 1, 1, 1}, circle::TensorType::TensorType_FLOAT32});
+  int in = cgen.addTensor({{1, 2, 2, 1}, param.data_type}, param.scale, param.zero_point);
   std::vector<int32_t> padding_data{1, 1, 1, 1};
   uint32_t padding_buf = cgen.addBuffer(padding_data);
   int padding = cgen.addTensor({{4, 1}, circle::TensorType::TensorType_INT32, padding_buf});
-  int out = cgen.addTensor({{2, 2, 2, 2}, circle::TensorType::TensorType_FLOAT32});
+  int out = cgen.addTensor({{1, 4, 4, 1}, param.data_type}, param.scale, param.zero_point);
 
   cgen.addOperatorPad({{in, padding}, {out}});
   cgen.setInputsAndOutputs({in}, {out});
@@ -123,14 +129,20 @@ TEST_F(GenModelTest, neg_OneOp_Pad_InvalidPadDim1)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_Pad_Type)
+TEST_P(PadVariation, neg_Type)
 {
+  auto &param = GetParam();
+
+  const circle::TensorType output_type = ((param.data_type == circle::TensorType::TensorType_UINT8)
+                                            ? circle::TensorType::TensorType_INT8
+                                            : circle::TensorType::TensorType_UINT8);
+
   CircleGen cgen;
-  int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+  int in = cgen.addTensor({{1, 2, 2, 1}, param.data_type}, param.scale, param.zero_point);
   std::vector<int32_t> padding_data{0, 0, 1, 1, 1, 1, 0, 0};
   uint32_t padding_buf = cgen.addBuffer(padding_data);
   int padding = cgen.addTensor({{4, 2}, circle::TensorType::TensorType_INT32, padding_buf});
-  int out = cgen.addTensor({{1, 4, 4, 1}, circle::TensorType::TensorType_UINT8}, 1.0, 1);
+  int out = cgen.addTensor({{1, 4, 4, 1}, output_type}, 1.0, 0);
 
   cgen.addOperatorPad({{in, padding}, {out}});
   cgen.setInputsAndOutputs({in}, {out});
diff --git a/tests/nnfw_api/src/one_op_tests/Slice.cc b/tests/nnfw_api/src/one_op_tests/Slice.cc
index 960cd88e3..002fb0132 100644
--- a/tests/nnfw_api/src/one_op_tests/Slice.cc
+++ b/tests/nnfw_api/src/one_op_tests/Slice.cc
@@ -34,6 +34,32 @@ class SliceVariation : public GenModelTest,
 {
 };
 
+INSTANTIATE_TEST_CASE_P(
+  GenModelTest, SliceVariation,
+  ::testing::Values(
+    SliceVariationParam{
+      {2, 2, 3, 1},
+      {0, 1, 1, 0},
+      {1, 1, 2, 1},
+      uniformTCD<float>({{1, 2, 3, 11, 12, 13, 21, 22, 23, 31, 32, 33}}, {{12, 13}})},
+    SliceVariationParam{
+      {2, 2, 3, 1},
+      {0, 1, 1, 0},
+      {1, 1, 2, 1},
+      uniformTCD<uint8_t>({{1, 2, 3, 11, 12, 13, 21, 22, 23, 31, 32, 33}}, {{12, 13}}),
+      circle::TensorType::TensorType_UINT8,
+      1,
+      0},
+    SliceVariationParam{
+      {2, 2, 3, 1},
+      {0, 1, 1, 0},
+      {1, 1, 2, 1},
+      uniformTCD<float>({{1, 2, 3, 11, 12, 13, 21, 22, 23, 31, 32, 33}}, {{12, 13}}),
+      circle::TensorType::TensorType_FLOAT32,
+      0,
+      0,
+      circle::TensorType::TensorType_INT64}));
+
 TEST_P(SliceVariation, Test)
 {
   auto &param = GetParam();
@@ -90,32 +116,6 @@ TEST_P(SliceVariation, Test)
   SUCCEED();
 }
 
-INSTANTIATE_TEST_CASE_P(
-  GenModelTest, SliceVariation,
-  ::testing::Values(
-    SliceVariationParam{
-      {2, 2, 3, 1},
-      {0, 1, 1, 0},
-      {1, 1, 2, 1},
-      uniformTCD<float>({{1, 2, 3, 11, 12, 13, 21, 22, 23, 31, 32, 33}}, {{12, 13}})},
-    SliceVariationParam{
-      {2, 2, 3, 1},
-      {0, 1, 1, 0},
-      {1, 1, 2, 1},
-      uniformTCD<uint8_t>({{1, 2, 3, 11, 12, 13, 21, 22, 23, 31, 32, 33}}, {{12, 13}}),
-      circle::TensorType::TensorType_UINT8,
-      1,
-      0},
-    SliceVariationParam{
-      {2, 2, 3, 1},
-      {0, 1, 1, 0},
-      {1, 1, 2, 1},
-      uniformTCD<float>({{1, 2, 3, 11, 12, 13, 21, 22, 23, 31, 32, 33}}, {{12, 13}}),
-      circle::TensorType::TensorType_FLOAT32,
-      0,
-      0,
-      circle::TensorType::TensorType_INT64}));
-
 TEST_F(GenModelTest, neg_OneOp_Slice_Type)
 {
   CircleGen cgen;
@@ -136,18 +136,48 @@ TEST_F(GenModelTest, neg_OneOp_Slice_Type)
   SUCCEED();
 }
 
-TEST_F(GenModelTest, neg_OneOp_Slice_DiffType)
+TEST_P(SliceVariation, neg_DiffType)
 {
+  auto &param = GetParam();
+
   CircleGen cgen;
-  int in = cgen.addTensor({{1, 3, 3, 2}, circle::TensorType::TensorType_FLOAT32});
-  std::vector<int32_t> begins_data = {0, 0, 1, 0};
-  uint32_t begins_buf = cgen.addBuffer(begins_data);
-  int begins = cgen.addTensor({{4}, circle::TensorType::TensorType_INT32, begins_buf});
-  std::vector<int64_t> sizes_data = {1, 2, 1, 1};
-  uint32_t sizes_buf = cgen.addBuffer(sizes_data);
-  int sizes = cgen.addTensor({{4}, circle::TensorType::TensorType_INT64, sizes_buf});
-  int out = cgen.addTensor({{1, 2, 1, 1}, circle::TensorType::TensorType_FLOAT32});
-  cgen.addOperatorSlice({{in, begins, sizes}, {out}});
+
+  int in = cgen.addTensor({param.input_shape, param.input_type}, param.scale, param.zero_point);
+  int out = cgen.addTensor({param.sizes, param.input_type}, param.scale, param.zero_point);
+  if (param.begins_type == circle::TensorType::TensorType_INT32)
+  {
+    uint32_t begins_buf = cgen.addBuffer(param.begins);
+    std::vector<int64_t> sizes_64(param.sizes.size());
+    for (int i = 0; i < param.begins.size(); i++)
+    {
+      sizes_64[i] = param.sizes[i];
+    }
+
+    int rank = param.begins.size();
+    int begins = cgen.addTensor({{rank}, param.begins_type, begins_buf});
+
+    uint32_t sizes_buf = cgen.addBuffer(sizes_64);
+    int sizes = cgen.addTensor({{rank}, circle::TensorType::TensorType_INT64, sizes_buf});
+
+    cgen.addOperatorSlice({{in, begins, sizes}, {out}});
+  }
+  else if (param.begins_type == circle::TensorType::TensorType_INT64)
+  {
+    std::vector<int64_t> begins_64(param.begins.size());
+    for (int i = 0; i < param.begins.size(); i++)
+    {
+      begins_64[i] = param.begins[i];
+    }
+
+    uint32_t begins_buf = cgen.addBuffer(begins_64);
+    int rank = param.begins.size();
+    int begins = cgen.addTensor({{rank}, param.begins_type, begins_buf});
+
+    uint32_t sizes_buf = cgen.addBuffer(param.sizes);
+    int sizes = cgen.addTensor({{rank}, circle::TensorType::TensorType_INT32, sizes_buf});
+
+    cgen.addOperatorSlice({{in, begins, sizes}, {out}});
+  }
   cgen.setInputsAndOutputs({in}, {out});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
diff --git a/tests/nnfw_api/src/one_op_tests/Softmax.cc b/tests/nnfw_api/src/one_op_tests/Softmax.cc
index 95debec33..aba4e89a0 100644
--- a/tests/nnfw_api/src/one_op_tests/Softmax.cc
+++ b/tests/nnfw_api/src/one_op_tests/Softmax.cc
@@ -30,6 +30,23 @@ class SoftmaxVariation : public GenModelTest, public ::testing::WithParamInterfa
 {
 };
 
+// Test with different value type
+INSTANTIATE_TEST_CASE_P(
+  GenModelTest, SoftmaxVariation,
+  ::testing::Values(
+    // float value
+    SoftmaxParam{
+      uniformTCD<float>({{0, -6, 2, 4, 3, -2, 10, 1}},
+                        {{.23463, .12877, .28658, .35003, .22528, .13664, .45365, .18443}})},
+    // uint8 value
+    SoftmaxParam{
+      uniformTCD<uint8_t>({{10, 4, 12, 14, 13, 8, 20, 11}}, {{60, 33, 73, 90, 58, 35, 116, 47}}),
+      circle::TensorType::TensorType_UINT8, 1.0, 10},
+    // int8 value
+    SoftmaxParam{
+      uniformTCD<int8_t>({{0, -6, 2, 4, 3, -2, 10, 1}}, {{-68, -95, -55, -38, -70, -93, -12, -81}}),
+      circle::TensorType::TensorType_INT8, 1.0, 0}));
+
 TEST_P(SoftmaxVariation, Test)
 {
   auto &param = GetParam();
@@ -95,28 +112,14 @@ TEST_F(GenModelTest, OneOp_Softmax)
   SUCCEED();
 }
 
-// Test with different value type
-INSTANTIATE_TEST_CASE_P(
-  GenModelTest, SoftmaxVariation,
-  ::testing::Values(
-    // float value
-    SoftmaxParam{
-      uniformTCD<float>({{0, -6, 2, 4, 3, -2, 10, 1}},
-                        {{.23463, .12877, .28658, .35003, .22528, .13664, .45365, .18443}})},
-    // uint8 value
-    SoftmaxParam{
-      uniformTCD<uint8_t>({{10, 4, 12, 14, 13, 8, 20, 11}}, {{60, 33, 73, 90, 58, 35, 116, 47}}),
-      circle::TensorType::TensorType_UINT8, 1.0, 10},
-    // int8 value
-    SoftmaxParam{
-      uniformTCD<int8_t>({{0, -6, 2, 4, 3, -2, 10, 1}}, {{-68, -95, -55, -38, -70, -93, -12, -81}}),
-      circle::TensorType::TensorType_INT8, 1.0, 0}));
-
-TEST_F(GenModelTest, neg_OneOp_Softmax_Type)
+TEST_P(SoftmaxVariation, neg_Type)
 {
+  auto &param = GetParam();
+
   CircleGen cgen;
-  int input = cgen.addTensor({{1, 2, 1, 4}, circle::TensorType::TensorType_FLOAT32});
-  int out = cgen.addTensor({{1, 2, 1, 4}, circle::TensorType::TensorType_INT8}, 1.0, 0);
+  int input =
+    cgen.addTensor({{1, 2, 1, 4}, param.data_type}, param.input_scale, param.input_zero_point);
+  int out = cgen.addTensor({{1, 2, 1, 4}, circle::TensorType::TensorType_BOOL});
   cgen.addOperatorSoftmax({{input}, {out}}, 0.1);
   cgen.setInputsAndOutputs({input}, {out});
 
diff --git a/tools/release_tool/onert_version.sh b/tools/release_tool/onert_version.sh
index 5c875e38b..374a58acf 100755
--- a/tools/release_tool/onert_version.sh
+++ b/tools/release_tool/onert_version.sh
@@ -27,7 +27,7 @@ show_version() {
   current_version=${version_line#"Version:"}
 
   if [ $nightly -eq 0 ]; then
-    echo $current_version~$(date "+%y%m%d%H")
+    echo $current_version~$(date -u "+%y%m%d%H")
   else
     echo $current_version
   fi
author	Chunseok Lee <chunseok.lee@samsung.com>	2021-10-19 11:32:46 +0900
committer	Chunseok Lee <chunseok.lee@samsung.com>	2021-10-19 11:32:46 +0900
commit	33ae5d70a1ed85d215c1293ed63afbf3517b07d5 (patch)
tree	9f1ace0f4760a8f7903ef15e2e92f1d1401e4b1e
parent	f4cf19e579a19c5346ccb2aad55bfd251065e447 (diff)
download	nnfw-33ae5d70a1ed85d215c1293ed63afbf3517b07d5.tar.gz nnfw-33ae5d70a1ed85d215c1293ed63afbf3517b07d5.tar.bz2 nnfw-33ae5d70a1ed85d215c1293ed63afbf3517b07d5.zip